- {/* Pipeline */}
+ {/* Section A — Pipeline */}
{t.rightPanelTitle}
@@ -359,46 +316,18 @@ function RightPanel({ t }) {
- {/* Stats */}
+ {/* Section B — Stats (flex-1 to fill remaining space) */}
-
- {/* Live Feed */}
-
-
- 📊 {t.detectionTitle}
-
-
-
-
- {feed.map((item, idx) => (
-
- {getTimestamp()}
-
- {item.type}
-
- {item.info}
-
- ))}
-
-
-
);
}
diff --git a/frontend/src/utils/commonTranslations.js b/frontend/src/utils/commonTranslations.js
new file mode 100644
index 0000000..d15cc17
--- /dev/null
+++ b/frontend/src/utils/commonTranslations.js
@@ -0,0 +1,30 @@
+/**
+ * Common translations shared across internal pages.
+ * Usage: const tc = commonTranslations[language] ?? commonTranslations['en-US'];
+ */
+export const commonTranslations = {
+ 'pt-BR': {
+ backToHome: 'Voltar para Home',
+ loading: 'Carregando...',
+ error: 'Erro',
+ success: 'Sucesso',
+ cancel: 'Cancelar',
+ confirm: 'Confirmar',
+ save: 'Salvar',
+ download: 'Baixar',
+ upload: 'Enviar arquivo',
+ reset: 'Reiniciar',
+ },
+ 'en-US': {
+ backToHome: 'Back to Home',
+ loading: 'Loading...',
+ error: 'Error',
+ success: 'Success',
+ cancel: 'Cancel',
+ confirm: 'Confirm',
+ save: 'Save',
+ download: 'Download',
+ upload: 'Upload file',
+ reset: 'Reset',
+ },
+};
diff --git a/tests/frontend/unit/HomePage.test.js b/tests/frontend/unit/HomePage.test.js
index 1101916..427754d 100644
--- a/tests/frontend/unit/HomePage.test.js
+++ b/tests/frontend/unit/HomePage.test.js
@@ -28,6 +28,9 @@ jest.mock('lucide-react', () => ({
MessageSquare: () =>
MessageSquare
,
Eye: () =>
Eye
,
GitBranch: () =>
GitBranch
,
+ LogOut: () =>
LogOut
,
+ Heart: () =>
Heart
,
+ Languages: () =>
Languages
,
}));
// Mock framer-motion to avoid animation issues in tests
@@ -70,30 +73,62 @@ jest.mock('../../../frontend/src/components/RAGButton', () => {
};
});
+// Mock LanguageToggle
+jest.mock('../../../frontend/src/components/LanguageToggle', () =>
+ function MockLanguageToggle() {
+ return
LanguageToggle
;
+ }
+);
+
+// Mock useAuth
+const mockHandleLogout = jest.fn();
+jest.mock('../../../frontend/src/hooks/useAuth', () => () => ({
+ handleLogout: mockHandleLogout,
+}));
+
+// Mock useAuthContext
+jest.mock('../../../frontend/src/context/AuthContext', () => ({
+ useAuthContext: () => ({
+ user: { name: 'Test User', avatar: 'TU', role: 'tester' },
+ isAuthenticated: true,
+ hasProfile: true,
+ isLoading: false,
+ }),
+}));
+
+// useLanguage mock — default PT-BR; individual tests can override via mockLanguageState
+const mockChangeLanguage = jest.fn();
+const mockLanguageState = { current: 'pt-BR' };
+jest.mock('../../../frontend/src/context/LanguageContext', () => ({
+ useLanguage: () => ({
+ language: mockLanguageState.current,
+ changeLanguage: mockChangeLanguage,
+ }),
+}));
+
// Helper function to render component with router
const renderWithRouter = (component) => {
return render(
{component} );
};
describe('HomePage Component', () => {
+ beforeEach(() => {
+ jest.clearAllMocks();
+ mockLanguageState.current = 'pt-BR';
+ });
+
describe('Initial Render', () => {
test('renders main heading', () => {
renderWithRouter(
);
-
+
// HomePage contains DataForge text
const dataforgeElements = screen.queryAllByText(/DataForge/i);
expect(dataforgeElements.length).toBeGreaterThan(0);
});
- test('renders main description', () => {
- renderWithRouter(
);
-
- expect(screen.getByText(/Big Data Quality Testing/i)).toBeInTheDocument();
- });
-
test('renders RAG button', () => {
renderWithRouter(
);
-
+
// RAGButton should be rendered as mocked component
const ragButton = screen.getByTestId('rag-button');
expect(ragButton).toBeInTheDocument();
@@ -103,7 +138,7 @@ describe('HomePage Component', () => {
describe('Navigation Links', () => {
test('has link to QA Checklist', () => {
renderWithRouter(
);
-
+
const links = screen.getAllByRole('link');
const qaLink = links.find(link => link.getAttribute('href') === '/checklist');
expect(qaLink).toBeTruthy();
@@ -111,7 +146,7 @@ describe('HomePage Component', () => {
test('has link to Generate Dataset', () => {
renderWithRouter(
);
-
+
const links = screen.getAllByRole('link');
const generateLink = links.find(link => link.getAttribute('href') === '/generate-dataset');
expect(generateLink).toBeTruthy();
@@ -121,28 +156,28 @@ describe('HomePage Component', () => {
describe('Feature Sections', () => {
test('displays data quality features', () => {
renderWithRouter(
);
-
+
// Check that the component renders without errors
expect(document.body).toBeInTheDocument();
});
test('displays schema validation features', () => {
renderWithRouter(
);
-
+
// Check that the component renders
expect(document.body).toBeInTheDocument();
});
test('displays streaming features', () => {
renderWithRouter(
);
-
+
const streamingText = screen.queryAllByText(/Streaming/i);
expect(streamingText.length).toBeGreaterThanOrEqual(0);
});
test('displays integration features', () => {
renderWithRouter(
);
-
+
const integrationText = screen.queryAllByText(/Integration/i);
expect(integrationText.length).toBeGreaterThanOrEqual(0);
});
@@ -151,7 +186,7 @@ describe('HomePage Component', () => {
describe('LLM Workflow Section', () => {
test('displays LLM workflow steps', () => {
renderWithRouter(
);
-
+
// Just check the component renders
expect(document.body).toBeInTheDocument();
});
@@ -160,7 +195,7 @@ describe('HomePage Component', () => {
describe('Interactive Features', () => {
test('allows switching between structure views', () => {
renderWithRouter(
);
-
+
// Just check the component renders interactable elements
const buttons = screen.getAllByRole('button');
expect(buttons.length).toBeGreaterThan(0);
@@ -168,7 +203,7 @@ describe('HomePage Component', () => {
test('allows switching between feature sections', () => {
renderWithRouter(
);
-
+
// Check component renders
expect(document.body).toBeInTheDocument();
});
@@ -181,17 +216,133 @@ describe('HomePage Component', () => {
test('has gradient background classes', () => {
renderWithRouter(
);
-
+
const gradientElements = document.querySelectorAll('[class*="gradient"]');
expect(gradientElements.length).toBeGreaterThan(0);
});
test('displays hero section', () => {
renderWithRouter(
);
-
+
// Check main heading is present
const dataforgeElements = screen.queryAllByText(/DataForge/i);
expect(dataforgeElements.length).toBeGreaterThan(0);
});
});
+
+ // ---------------------------------------------------------------------------
+ // AJUSTE 2 — HomeHeader
+ // ---------------------------------------------------------------------------
+ describe('HomeHeader', () => {
+ test('renders user avatar with correct initials', () => {
+ renderWithRouter(
);
+ expect(screen.getByText('TU')).toBeInTheDocument();
+ });
+
+ test('renders user name', () => {
+ renderWithRouter(
);
+ expect(screen.getByText('Test User')).toBeInTheDocument();
+ });
+
+ test('renders logout button', () => {
+ renderWithRouter(
);
+ // Logout button should be present (title "Sair" in PT-BR)
+ const logoutBtn = screen.getByTitle(/Sair|Logout/i);
+ expect(logoutBtn).toBeInTheDocument();
+ });
+
+ test('calls handleLogout when logout button clicked', () => {
+ renderWithRouter(
);
+ const logoutBtn = screen.getByTitle(/Sair|Logout/i);
+ fireEvent.click(logoutBtn);
+ expect(mockHandleLogout).toHaveBeenCalled();
+ });
+
+ test('renders navigation links in header', () => {
+ renderWithRouter(
);
+ const links = screen.getAllByRole('link');
+ const methodologyLink = links.find(l => l.getAttribute('href') === '/methodology');
+ const checklistLink = links.find(l => l.getAttribute('href') === '/checklist');
+ const generateLink = links.find(l => l.getAttribute('href') === '/generate-dataset');
+ expect(methodologyLink).toBeTruthy();
+ expect(checklistLink).toBeTruthy();
+ expect(generateLink).toBeTruthy();
+ });
+
+ test('renders LanguageToggle in header', () => {
+ renderWithRouter(
);
+ expect(screen.getByTestId('language-toggle')).toBeInTheDocument();
+ });
+ });
+
+ // ---------------------------------------------------------------------------
+ // AJUSTE 3 — i18n Translations
+ // ---------------------------------------------------------------------------
+ describe('Translations', () => {
+ test('renders hero title in PT-BR', () => {
+ mockLanguageState.current = 'pt-BR';
+ renderWithRouter(
);
+ expect(screen.getByText(/Testes de Qualidade/i)).toBeInTheDocument();
+ });
+
+ test('renders hero title in EN-US', () => {
+ mockLanguageState.current = 'en-US';
+ renderWithRouter(
);
+ expect(screen.getByText(/Big Data Quality Testing/i)).toBeInTheDocument();
+ });
+
+ test('renders navigation links with translated labels in PT-BR', () => {
+ mockLanguageState.current = 'pt-BR';
+ renderWithRouter(
);
+ // Use queryAllByText to handle multiple occurrences (header + buttons)
+ expect(screen.queryAllByText(/Metodologia/i).length).toBeGreaterThan(0);
+ expect(screen.queryAllByText(/Checklist QA/i).length).toBeGreaterThan(0);
+ });
+
+ test('renders footer copyright in PT-BR', () => {
+ mockLanguageState.current = 'pt-BR';
+ renderWithRouter(
);
+ expect(screen.getByText(/Todos os direitos reservados/i)).toBeInTheDocument();
+ });
+
+ test('renders footer copyright in EN-US', () => {
+ mockLanguageState.current = 'en-US';
+ renderWithRouter(
);
+ expect(screen.getByText(/All rights reserved/i)).toBeInTheDocument();
+ });
+ });
+
+ // ---------------------------------------------------------------------------
+ // AJUSTE 4 — HomeFooter
+ // ---------------------------------------------------------------------------
+ describe('HomeFooter', () => {
+ test('renders footer element', () => {
+ renderWithRouter(
);
+ expect(document.querySelector('footer')).toBeInTheDocument();
+ });
+
+ test('renders copyright text in PT-BR', () => {
+ mockLanguageState.current = 'pt-BR';
+ renderWithRouter(
);
+ expect(screen.getByText(/Todos os direitos reservados/i)).toBeInTheDocument();
+ });
+
+ test('renders copyright text in EN-US', () => {
+ mockLanguageState.current = 'en-US';
+ renderWithRouter(
);
+ expect(screen.getByText(/All rights reserved/i)).toBeInTheDocument();
+ });
+
+ test('renders tech stack info', () => {
+ renderWithRouter(
);
+ // footer tech string contains "React" and "PySpark"
+ expect(screen.queryAllByText(/React/i).length).toBeGreaterThan(0);
+ expect(screen.queryAllByText(/PySpark/i).length).toBeGreaterThan(0);
+ });
+
+ test('renders version string', () => {
+ renderWithRouter(
);
+ expect(screen.getByText(/v1\.0\.0/i)).toBeInTheDocument();
+ });
+ });
});
diff --git a/tests/frontend/unit/LoginPage.test.js b/tests/frontend/unit/LoginPage.test.js
index 668de65..38d7e33 100644
--- a/tests/frontend/unit/LoginPage.test.js
+++ b/tests/frontend/unit/LoginPage.test.js
@@ -207,3 +207,12 @@ describe('LoginPage — Error display', () => {
expect(elements.length).toBeGreaterThan(0);
});
});
+
+describe('LoginPage — Right Panel', () => {
+ test('right panel does not contain live detection feed', () => {
+ renderLoginPage();
+ // "Detecções" / "Detections" should not appear — feed was removed
+ expect(screen.queryByText(/Detecções/i)).not.toBeInTheDocument();
+ expect(screen.queryByText(/Detections/i)).not.toBeInTheDocument();
+ });
+});
From 2c0bffb7d65bd57b3495636c8f3383585dcb8e48 Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 00:00:58 -0300
Subject: [PATCH 09/17] feat(login): replace hardcoded stats with dynamic
/api/stats endpoint and useStats hook
---
frontend/src/hooks/useStats.js | 60 +++++++++++
frontend/src/pages/LoginPage.js | 11 ++-
src/api.py | 56 +++++++++++
tests/frontend/unit/LoginPage.profile.test.js | 7 ++
tests/frontend/unit/LoginPage.test.js | 7 ++
tests/frontend/unit/useStats.test.js | 99 +++++++++++++++++++
6 files changed, 236 insertions(+), 4 deletions(-)
create mode 100644 frontend/src/hooks/useStats.js
create mode 100644 tests/frontend/unit/useStats.test.js
diff --git a/frontend/src/hooks/useStats.js b/frontend/src/hooks/useStats.js
new file mode 100644
index 0000000..a4291b3
--- /dev/null
+++ b/frontend/src/hooks/useStats.js
@@ -0,0 +1,60 @@
+/**
+ * useStats — fetches live platform stats from GET /api/stats.
+ *
+ * Returns formatted strings ready for use in StatCard:
+ * tests → "971+" (total test count)
+ * datasets → "1180+" (files in storage/)
+ * coverage → "86%" (line coverage from cobertura XML)
+ * responseSla → "<2s" (SLA from performance benchmarks)
+ *
+ * Falls back to last-known values when the API is unreachable (e.g. dev offline).
+ */
+
+import { useEffect, useState } from 'react';
+import { getApiUrl } from '../config/api';
+
+// Last-known baselines used while loading or when the API fails
+const FALLBACK = {
+ tests: '970+',
+ datasets: '1180+',
+ coverage: '86%',
+ responseSla: '<2s',
+};
+
+export default function useStats() {
+ const [stats, setStats] = useState(FALLBACK);
+
+ useEffect(() => {
+ let cancelled = false;
+
+ const fetchStats = async () => {
+ try {
+ const res = await fetch(getApiUrl('/api/stats'), {
+ method: 'GET',
+ headers: { 'Content-Type': 'application/json' },
+ // Short timeout — login page must not stall for stats
+ signal: AbortSignal.timeout ? AbortSignal.timeout(4000) : undefined,
+ });
+ if (!res.ok) return;
+ const data = await res.json();
+ if (cancelled) return;
+
+ setStats({
+ tests: `${data.tests_total}+`,
+ datasets: `${data.datasets_total}+`,
+ coverage: `${data.coverage_pct}%`,
+ responseSla: data.response_sla_ms < 1000
+ ? `<${data.response_sla_ms}ms`
+ : `<${data.response_sla_ms / 1000}s`,
+ });
+ } catch {
+ // Network error or timeout — silently keep fallback values
+ }
+ };
+
+ fetchStats();
+ return () => { cancelled = true; };
+ }, []);
+
+ return stats;
+}
diff --git a/frontend/src/pages/LoginPage.js b/frontend/src/pages/LoginPage.js
index 6620f5e..1702288 100644
--- a/frontend/src/pages/LoginPage.js
+++ b/frontend/src/pages/LoginPage.js
@@ -27,6 +27,7 @@ import { useAuthContext } from '../context/AuthContext';
import { useLanguage } from '../context/LanguageContext';
import LanguageToggle from '../components/LanguageToggle';
import useAuth from '../hooks/useAuth';
+import useStats from '../hooks/useStats';
import {
floatingNode,
popIn,
@@ -266,6 +267,8 @@ function StatCard({ icon: Icon, label, value, color }) {
}
function RightPanel({ t }) {
+ const { tests, datasets, coverage, responseSla } = useStats();
+
return (
{/* Section A — Pipeline */}
@@ -323,10 +326,10 @@ function RightPanel({ t }) {
variants={{ animate: { transition: { staggerChildren: 0.1 } } }}
className="grid grid-cols-2 gap-3 flex-1"
>
-
-
-
-
+
+
+
+
);
diff --git a/src/api.py b/src/api.py
index db96fcc..d4566be 100644
--- a/src/api.py
+++ b/src/api.py
@@ -105,6 +105,62 @@ def add_security_headers(response):
# Continue registering other blueprints even if one fails
+@app.route("/api/stats", methods=["GET"])
+def platform_stats():
+ """Return live platform stats used by the login page dashboard."""
+ import re
+ from pathlib import Path
+
+ base = Path(__file__).resolve().parent.parent # workspace root
+
+ # ── Backend tests: count `def test_` functions ──────────────────────────
+ backend_tests = 0
+ for f in base.glob("tests/backend/**/*.py"):
+ try:
+ content = f.read_text(encoding="utf-8", errors="ignore")
+ backend_tests += len(re.findall(r"^\s*def test_", content, re.MULTILINE))
+ except OSError:
+ pass
+
+ # ── Frontend tests: count test( / it( calls ──────────────────────────────
+ frontend_tests = 0
+ for f in base.glob("tests/frontend/**/*.test.js"):
+ try:
+ content = f.read_text(encoding="utf-8", errors="ignore")
+ frontend_tests += len(re.findall(r"(?:^|\s)(?:test|it)\s*\(", content, re.MULTILINE))
+ except OSError:
+ pass
+
+ total_tests = backend_tests + frontend_tests
+
+ # ── Dataset files in storage ─────────────────────────────────────────────
+ storage_path = base / "storage"
+ dataset_count = 0
+ if storage_path.exists():
+ dataset_count = sum(1 for p in storage_path.rglob("*") if p.is_file())
+
+ # ── Coverage from cobertura XML (generated by Jest --coverage) ───────────
+ coverage_pct = 86 # last known baseline
+ coverage_xml = base / "test-results" / "frontend" / "coverage" / "cobertura-coverage.xml"
+ if coverage_xml.exists():
+ try:
+ xml_content = coverage_xml.read_text(encoding="utf-8")
+ m = re.search(r'line-rate="([0-9.]+)"', xml_content)
+ if m:
+ coverage_pct = round(float(m.group(1)) * 100)
+ except OSError:
+ pass
+
+ return jsonify(
+ {
+ "tests_total": total_tests,
+ "datasets_total": dataset_count,
+ "coverage_pct": coverage_pct,
+ "response_sla_ms": 2000,
+ }
+ )
+
+
@app.route("/", methods=["GET"])
def health_check():
"""Health check endpoint to verify API is running."""
diff --git a/tests/frontend/unit/LoginPage.profile.test.js b/tests/frontend/unit/LoginPage.profile.test.js
index 5f091c3..c11f6ae 100644
--- a/tests/frontend/unit/LoginPage.profile.test.js
+++ b/tests/frontend/unit/LoginPage.profile.test.js
@@ -56,6 +56,13 @@ jest.mock('../../../frontend/src/context/LanguageContext', () => ({
jest.mock('../../../frontend/src/hooks/useAuth', () => () => mockUseAuth());
+jest.mock('../../../frontend/src/hooks/useStats', () => () => ({
+ tests: '970+',
+ datasets: '1180+',
+ coverage: '86%',
+ responseSla: '<2s',
+}));
+
jest.mock('framer-motion', () => ({
motion: {
div: ({ children, ...props }) => {children}
,
diff --git a/tests/frontend/unit/LoginPage.test.js b/tests/frontend/unit/LoginPage.test.js
index 38d7e33..9fce122 100644
--- a/tests/frontend/unit/LoginPage.test.js
+++ b/tests/frontend/unit/LoginPage.test.js
@@ -33,6 +33,13 @@ jest.mock('../../../frontend/src/hooks/useAuth', () => () => ({
isLoading: false,
}));
+jest.mock('../../../frontend/src/hooks/useStats', () => () => ({
+ tests: '970+',
+ datasets: '1180+',
+ coverage: '86%',
+ responseSla: '<2s',
+}));
+
// Mock useLanguage
const mockChangeLanguage = jest.fn();
jest.mock('../../../frontend/src/context/LanguageContext', () => ({
diff --git a/tests/frontend/unit/useStats.test.js b/tests/frontend/unit/useStats.test.js
new file mode 100644
index 0000000..54f256e
--- /dev/null
+++ b/tests/frontend/unit/useStats.test.js
@@ -0,0 +1,99 @@
+/**
+ * Tests for frontend/src/hooks/useStats.js
+ */
+
+import { renderHook, waitFor } from '@testing-library/react';
+import '@testing-library/jest-dom';
+
+// Mock getApiUrl
+jest.mock('../../../frontend/src/config/api', () => ({
+ getApiUrl: (path) => `http://localhost:5000${path}`,
+}));
+
+const MOCK_RESPONSE = {
+ tests_total: 971,
+ datasets_total: 1180,
+ coverage_pct: 86,
+ response_sla_ms: 2000,
+};
+
+beforeEach(() => {
+ jest.spyOn(global, 'fetch').mockResolvedValue({
+ ok: true,
+ json: async () => MOCK_RESPONSE,
+ });
+});
+
+afterEach(() => {
+ jest.restoreAllMocks();
+});
+
+import useStats from '../../../frontend/src/hooks/useStats';
+
+describe('useStats', () => {
+ test('returns fallback values on initial render', () => {
+ jest.spyOn(global, 'fetch').mockImplementation(() => new Promise(() => {})); // hanging
+ const { result } = renderHook(() => useStats());
+ // Before fetch resolves, fallback values are returned
+ expect(result.current.tests).toBe('970+');
+ expect(result.current.datasets).toBe('1180+');
+ expect(result.current.coverage).toBe('86%');
+ expect(result.current.responseSla).toBe('<2s');
+ });
+
+ test('updates stats after successful API response', async () => {
+ const { result } = renderHook(() => useStats());
+ await waitFor(() => {
+ expect(result.current.tests).toBe('971+');
+ });
+ expect(result.current.datasets).toBe('1180+');
+ expect(result.current.coverage).toBe('86%');
+ expect(result.current.responseSla).toBe('<2s');
+ });
+
+ test('formats response_sla_ms >= 1000 as seconds', async () => {
+ jest.spyOn(global, 'fetch').mockResolvedValue({
+ ok: true,
+ json: async () => ({ ...MOCK_RESPONSE, response_sla_ms: 2000 }),
+ });
+ const { result } = renderHook(() => useStats());
+ await waitFor(() => expect(result.current.tests).toBe('971+'));
+ expect(result.current.responseSla).toBe('<2s');
+ });
+
+ test('formats response_sla_ms < 1000 as milliseconds', async () => {
+ jest.spyOn(global, 'fetch').mockResolvedValue({
+ ok: true,
+ json: async () => ({ ...MOCK_RESPONSE, response_sla_ms: 500 }),
+ });
+ const { result } = renderHook(() => useStats());
+ await waitFor(() => expect(result.current.tests).toBe('971+'));
+ expect(result.current.responseSla).toBe('<500ms');
+ });
+
+ test('keeps fallback values when fetch throws a network error', async () => {
+ jest.spyOn(global, 'fetch').mockRejectedValue(new Error('Network error'));
+ const { result } = renderHook(() => useStats());
+ // Wait a tick
+ await new Promise((r) => setTimeout(r, 50));
+ expect(result.current.tests).toBe('970+');
+ expect(result.current.datasets).toBe('1180+');
+ });
+
+ test('keeps fallback values when API returns non-ok status', async () => {
+ jest.spyOn(global, 'fetch').mockResolvedValue({ ok: false, json: async () => ({}) });
+ const { result } = renderHook(() => useStats());
+ await new Promise((r) => setTimeout(r, 50));
+ expect(result.current.tests).toBe('970+');
+ });
+
+ test('calls correct API endpoint', async () => {
+ const fetchSpy = jest.spyOn(global, 'fetch');
+ const { result } = renderHook(() => useStats());
+ await waitFor(() => expect(result.current.tests).toBe('971+'));
+ expect(fetchSpy).toHaveBeenCalledWith(
+ 'http://localhost:5000/api/stats',
+ expect.objectContaining({ method: 'GET' })
+ );
+ });
+});
From d18a88bace369d187202670acfff4e78b37ee90c Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 00:01:50 -0300
Subject: [PATCH 10/17] fix(home): remove 'Generate Dataset' option from header menu
---
frontend/src/components/HomePage.js | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/frontend/src/components/HomePage.js b/frontend/src/components/HomePage.js
index 05278b5..3bffea7 100644
--- a/frontend/src/components/HomePage.js
+++ b/frontend/src/components/HomePage.js
@@ -27,9 +27,8 @@ const DataQualityLLMSystem = () => {
navHome: 'Home',
navMethodology: 'Metodologia',
navChecklist: 'Checklist QA',
- navGenerate: 'Gerar Dataset',
logout: 'Sair',
- heroTitle: 'DataForgeTest\nTestes de Qualidade de Dados Big Data',
+ heroTitle: 'DataForgeTest\nTestes de Qualidade para Big Data',
heroSubtitle: 'Testes avançados de qualidade com métricas, suporte LLM + RAG e\ngeração automatizada de código PySpark',
btnChecklist: 'Checklist Support QA',
btnGenerate: 'Gerar Dataset',
@@ -48,7 +47,6 @@ const DataQualityLLMSystem = () => {
navHome: 'Home',
navMethodology: 'Methodology',
navChecklist: 'QA Checklist',
- navGenerate: 'Generate Dataset',
logout: 'Logout',
heroTitle: 'DataForgeTest\nBig Data Quality Testing',
heroSubtitle: 'Advanced data quality testing with metrics, LLM + RAG support, and\nautomated PySpark code generation',
From c84601c73241c091f42021bcfc5cbcae18f43e19 Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 08:39:22 -0300
Subject: [PATCH 11/17] fix(ci): add frontend unit tests to stabilize CI coverage
---
tests/frontend/unit/ProtectedRoute.test.js | 7 ++
tests/frontend/unit/authStorage.test.js | 11 +++
.../frontend/unit/commonTranslations.test.js | 84 +++++++++++++++++++
tests/frontend/unit/useStats.test.js | 16 ++++
4 files changed, 118 insertions(+)
create mode 100644 tests/frontend/unit/commonTranslations.test.js
diff --git a/tests/frontend/unit/ProtectedRoute.test.js b/tests/frontend/unit/ProtectedRoute.test.js
index f916359..33d398d 100644
--- a/tests/frontend/unit/ProtectedRoute.test.js
+++ b/tests/frontend/unit/ProtectedRoute.test.js
@@ -86,4 +86,11 @@ describe('ProtectedRoute', () => {
})
);
});
+
+ test('LoadingScreen shows "Loading..." label when language is en-US', () => {
+ mockUseLanguage.mockReturnValue({ language: 'en-US', changeLanguage: jest.fn() });
+ renderProtectedRoute({ isAuthenticated: false, hasProfile: false, isLoading: true });
+ expect(screen.getByTestId('loading-screen')).toBeInTheDocument();
+ expect(screen.getByText('Loading...')).toBeInTheDocument();
+ });
});
diff --git a/tests/frontend/unit/authStorage.test.js b/tests/frontend/unit/authStorage.test.js
index 11d5d52..146576d 100644
--- a/tests/frontend/unit/authStorage.test.js
+++ b/tests/frontend/unit/authStorage.test.js
@@ -61,6 +61,11 @@ describe('getSession', () => {
expect(getSession()).toBeNull();
});
+ test('returns null and clears key when JSON is malformed', () => {
+ localStorage.setItem(SESSION_KEY, 'not-valid-json{{{');
+ expect(getSession()).toBeNull();
+ });
+
test('returns null when session is expired', () => {
const expired = {
userId: 'user-1',
@@ -123,6 +128,12 @@ describe('saveProfile', () => {
const stored = JSON.parse(localStorage.getItem(SESSION_KEY));
expect(stored.profile).toEqual(profileData);
});
+
+ test('does nothing when there is no active session', () => {
+ // No session in localStorage
+ expect(() => saveProfile({ role: 'tester' })).not.toThrow();
+ expect(localStorage.getItem(SESSION_KEY)).toBeNull();
+ });
});
describe('hasProfile', () => {
diff --git a/tests/frontend/unit/commonTranslations.test.js b/tests/frontend/unit/commonTranslations.test.js
new file mode 100644
index 0000000..41f71fa
--- /dev/null
+++ b/tests/frontend/unit/commonTranslations.test.js
@@ -0,0 +1,84 @@
+/**
+ * Tests for frontend/src/utils/commonTranslations.js
+ */
+
+import { commonTranslations } from '../../../frontend/src/utils/commonTranslations';
+
+const EXPECTED_KEYS = [
+ 'backToHome',
+ 'loading',
+ 'error',
+ 'success',
+ 'cancel',
+ 'confirm',
+ 'save',
+ 'download',
+ 'upload',
+ 'reset',
+];
+
+describe('commonTranslations', () => {
+ test('is exported as a non-null object', () => {
+ expect(commonTranslations).toBeDefined();
+ expect(typeof commonTranslations).toBe('object');
+ expect(commonTranslations).not.toBeNull();
+ });
+
+ test('contains pt-BR locale', () => {
+ expect(commonTranslations).toHaveProperty('pt-BR');
+ });
+
+ test('contains en-US locale', () => {
+ expect(commonTranslations).toHaveProperty('en-US');
+ });
+
+ test.each(EXPECTED_KEYS)(
+ 'pt-BR has non-empty string for key "%s"',
+ (key) => {
+ expect(typeof commonTranslations['pt-BR'][key]).toBe('string');
+ expect(commonTranslations['pt-BR'][key].length).toBeGreaterThan(0);
+ }
+ );
+
+ test.each(EXPECTED_KEYS)(
+ 'en-US has non-empty string for key "%s"',
+ (key) => {
+ expect(typeof commonTranslations['en-US'][key]).toBe('string');
+ expect(commonTranslations['en-US'][key].length).toBeGreaterThan(0);
+ }
+ );
+
+ test('pt-BR and en-US have the same set of keys', () => {
+ const ptKeys = Object.keys(commonTranslations['pt-BR']).sort();
+ const enKeys = Object.keys(commonTranslations['en-US']).sort();
+ expect(ptKeys).toEqual(enKeys);
+ });
+
+ test('pt-BR backToHome is in Portuguese', () => {
+ expect(commonTranslations['pt-BR'].backToHome).toBe('Voltar para Home');
+ });
+
+ test('en-US backToHome is in English', () => {
+ expect(commonTranslations['en-US'].backToHome).toBe('Back to Home');
+ });
+
+ test('pt-BR loading text matches expected value', () => {
+ expect(commonTranslations['pt-BR'].loading).toBe('Carregando...');
+ });
+
+ test('en-US loading text matches expected value', () => {
+ expect(commonTranslations['en-US'].loading).toBe('Loading...');
+ });
+
+ test('translations can be used with fallback pattern', () => {
+ const lang = 'en-US';
+ const tc = commonTranslations[lang] ?? commonTranslations['en-US'];
+ expect(tc.error).toBe('Error');
+ });
+
+ test('unknown locale falls back to en-US via nullish coalescing', () => {
+ const lang = 'fr-FR';
+ const tc = commonTranslations[lang] ?? commonTranslations['en-US'];
+ expect(tc).toEqual(commonTranslations['en-US']);
+ });
+});
diff --git a/tests/frontend/unit/useStats.test.js b/tests/frontend/unit/useStats.test.js
index 54f256e..683ae07 100644
--- a/tests/frontend/unit/useStats.test.js
+++ b/tests/frontend/unit/useStats.test.js
@@ -96,4 +96,20 @@ describe('useStats', () => {
expect.objectContaining({ method: 'GET' })
);
});
+
+ test('falls back gracefully when AbortSignal.timeout is unavailable', async () => {
+ const originalTimeout = AbortSignal.timeout;
+ // Simulate environments where AbortSignal.timeout does not exist
+ delete AbortSignal.timeout;
+ try {
+ jest.spyOn(global, 'fetch').mockResolvedValue({
+ ok: true,
+ json: async () => MOCK_RESPONSE,
+ });
+ const { result } = renderHook(() => useStats());
+ await waitFor(() => expect(result.current.tests).toBe('971+'));
+ } finally {
+ AbortSignal.timeout = originalTimeout;
+ }
+ });
});
From 0e9b7c865f28d12185a2caae7b911f98dae999c7 Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 09:50:21 -0300
Subject: [PATCH 12/17] docs: add MSR (oliveira2025) mined-posts CSV datasets for import
---
.../mrs_oliveira2025/all_posts_mined.csv | 4091 +++++++++++++++++
.../cleaned_all_posts_mined.csv | 761 +++
..._posts_with_test_tools_and_methods (1).csv | 71 +
3 files changed, 4923 insertions(+)
create mode 100644 docs_to_import/mrs_oliveira2025/all_posts_mined.csv
create mode 100644 docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv
create mode 100644 docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv
diff --git a/docs_to_import/mrs_oliveira2025/all_posts_mined.csv b/docs_to_import/mrs_oliveira2025/all_posts_mined.csv
new file mode 100644
index 0000000..d717b18
--- /dev/null
+++ b/docs_to_import/mrs_oliveira2025/all_posts_mined.csv
@@ -0,0 +1,4091 @@
+Link
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
diff --git a/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv b/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv
new file mode 100644
index 0000000..7b12d1d
--- /dev/null
+++ b/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv
@@ -0,0 +1,761 @@
+Link
+https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g
+https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n
+https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm
+https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4
+https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp
+https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63
+https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j
+https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb
+https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd
+https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl
+https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a
+https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp
+https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c
+https://dev.to/m1pko/data-quality-technical-debt-from-hell
+https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i
+https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8
+https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47
+https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf
+https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag
+https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh
+https://dev.to/namnguyen
+https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5
+https://dev.to/codexam/why-is-big-data-important-40ha
+https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533
+https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j
+https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo
+https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob
+https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52
+https://dev.to/jeremystan/airbnb-quality-data-for-all-280f
+https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top
+https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908
+https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
+https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e
+https://dev.to/daryashirokova
+https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4
+https://dev.to/reneebetina
+https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a
+https://dev.to/apssouza22/tech-lead-playbook-523
+https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm
+https://dev.to/dataform
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii
+https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce
+https://dev.to/berthaw82414312
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi
+https://dev.to/tinybirdco
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm
+https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7
+https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i
+https://dev.to/andyb1979/android-chart-performance-comparison-5ej7
+https://dev.to/habereder/comment/po6j
+https://dev.to/bytebodger/litmus-tests-in-tech-1ll7
+https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp
+https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62
+https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi
+https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo
+https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1
+https://dev.to/t/testing/page/73
+https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd
+https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm
+https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49
+https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p
+https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk
+https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage
+https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data
+https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo
+https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process
+https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data
+https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data
+https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow
+https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r
+https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other
+https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w
+https://stackoverflow.com/questions/64961961/shared-array-for-big-data
+https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu
+https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list
+https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels
+https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk
+https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year
+https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget
+https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data
+https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes
+https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server
+https://stackoverflow.com/questions/64014590/application-insights-with-big-data
+https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but
+https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data
+https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop
+https://stackoverflow.com/questions/61221081/random-forest-for-big-data
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base
+https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data
+https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data
+https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core
+https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data
+https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data
+https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame
+https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded
+https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data
+https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster
+https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data
+https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql
+https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary
+https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas
+https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization
+https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation
+https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file
+https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling
+https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python
+https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index
+https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql
+https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data
+https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group
+https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse
+https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql
+https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel
+https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r
+https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data
+https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data
+https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data
+https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python
+https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny
+https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data
+https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d
+https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists
+https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python
+https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data
+https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration
+https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb
+https://stackoverflow.com/questions/60306007/python-big-data-regression
+https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview
+https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets
+https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data
+https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file
+https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk
+https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services
+https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects
+https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining
+https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v
+https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in
+https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data
+https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native
+https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat
+https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r
+https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds
+https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c
+https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data
+https://stackoverflow.com/questions/69758458/big-data-structure
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark
+https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data
+https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native
+https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps
+https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in
+https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel
+https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time
+https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl
+https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python
+https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data
+https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown
+https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages
+https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data
+https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra
+https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing
+https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data
+https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file
+https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql
+https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql
+https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage
+https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api
+https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values
+https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system
+https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark
+https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t
+https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r
+https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data
+https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed
+https://stackoverflow.com/questions/66744410/laravel-delete-big-data
+https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql
+https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql
+https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s
+https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice
+https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members
+https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api
+https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark
+https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data
+https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set
+https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta
+https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category
+https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out
+https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas
+https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection
+https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas
+https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09
+https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485
+https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e
+https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf
+https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON
+https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948
+https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259
+https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201
+https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e
+https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2
+https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63
+https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab
+https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3
+https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390
+https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b
+https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b
+https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c
+https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364
+https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5
+https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259
+https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8
+https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f
+https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0
+https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b
+https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5
+https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d
+https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e
+https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4
+https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f
+https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510
+https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d
+https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6
+https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b
+https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff
+https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e
+https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b
+https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b
+https://medium.com/@Dima/big-data-checklist-1b8e3214f96
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22
+https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e
+https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165
+https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee
+https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615
+https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c
+https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2
+https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246
+https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf
+https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7
+https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83
+https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187
+https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1
+https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08
+https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946
+https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3
+https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143
+https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082
+https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93
+https://informationit27.medium.com/explain-big-data-testing-b555517f9902
+https://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be
+https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql
+https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet
+https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output
+https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory
+https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files
+https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash
+https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data
+https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data
+https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec
+https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js
+https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows
+https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python
+https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data
+https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating
+https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data
+https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api
+https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data
+https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt
+https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set
+https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
+https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark
+https://stackoverflow.com/questions/76104308/randomforest-for-big-data
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data
+https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox
+https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases
+https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server
+https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame
+https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable
+https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls
+https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template
+https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data
+https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r
+https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb
+https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files
+https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch
+https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug
+https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin
+https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python
+https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once
+https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts
+https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an
+https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue
+https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors
+https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data
+https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python
+https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data
+https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob
+https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w
+https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit
+https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time
+https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage
+https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common
+https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data
+https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle
+https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance
+https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python
+https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data
+https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an
+https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data
+https://stackoverflow.com/questions/73274450/big-data-in-tableview
+https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql
+https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data
+https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c
+https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set
+https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries
+https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way
+https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file
+https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix
+https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java
+https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python
+https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys
+https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu
+https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data
+https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy
+https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame
+https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames
+https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise
+https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values
+https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines
+https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g
+https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data
+https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object
+https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data
+https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data
+https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python
+https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error
+https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar
+https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor
+https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu
+https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data
+https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3
+https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment
+https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca
+https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data
+https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks
+https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data
+https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps
+https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality
+https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB
+https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl
+https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality
+https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK
+https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan
+https://www.linkedin.com/pulse/big-data-testing-qa-touch
+https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir
+https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7
+https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory
+https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen
+https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle
+https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran
+https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow
+https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf
+https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e
+https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc
+https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay
+https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov
+https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc
+https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB
+https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post
+https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing
+https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering
+https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair
+https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson
+https://www.linkedin.com/pulse/testing-big-data-gagan-mehra
+https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment
+https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment
+https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw
+https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f
+https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport
+https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami
+https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin
+https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR
+https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management
+https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking
+https://www.linkedin.com/pulse/data-quality-testing-grant-brodie
+https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308
+https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan
+https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta
+https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter
+https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov
+https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa
+https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc
+https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality
+https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca
+https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri
+https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner
+https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5
+https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf
+https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card
+https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics
+https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az
+https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc
+https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci
+https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria
+https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier
+https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc
+https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin
+https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik
+https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha
+https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325
+https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953
+https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj
+https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view
+https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f
+https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data
+https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports
+https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis
+https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data
+https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c
+https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data
+https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index
+https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data
+https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r
+https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt
+https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back
+https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b
+https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data
+https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func
+https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data
+https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data
+https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string
+https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages
+https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data
+https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way
+https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov
+https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data
+https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame
+https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column
+https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda
+https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools
+https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv
+https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data
+https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data
+https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado
+https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data
+https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl
+https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel
+https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand
+https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data
+https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data
+https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data
+https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case
+https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods
+https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f
+https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh
+https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set
+https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications
+https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data
+https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data
+https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data
+https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se
+https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set
+https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace
+https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel
+https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data
+https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing
+https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi
+https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and
+https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage
+https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise
+https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data
+https://stackoverflow.com/questions/44502825/performance-testing-on-big-data
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive
+https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as
+https://stackoverflow.com/questions/31162894/how-to-create-big-data-project
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
+https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c
+https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern
+https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing
+https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system
+https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products
+https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data
+https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data
+https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data
+https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data
+https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms
+https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job
+https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil
+https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing
+https://stackoverflow.com/questions/48373636/big-data-in-datalab
+https://stackoverflow.com/questions/58725538/do-we-visualize-big-data
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don
+https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error
+https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository
+https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas
+https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas
+https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database
+https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data
+https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php
+https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial
+https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce
+https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b
+https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data
+https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark
+https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest
+https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data
+https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data
+https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana
+https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data
+https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data
+https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction
+https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data
+https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi
+https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second
+https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db
+https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data
+https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments
+https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed
+https://sqa.stackexchange.com/questions/37718/big-data-application-testing
diff --git a/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv b/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv
new file mode 100644
index 0000000..6c44a2e
--- /dev/null
+++ b/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv
@@ -0,0 +1,71 @@
+link,ferramentas,metodo
+https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp,"JUnit, JUnit 5, JUnit, Jest",Integration Testing
+https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22,,Exploratory Testing
+https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730,Selenium,
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo,,Test-Driven Development
+https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi,Selenium,
+https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl,,Regression Testing
+https://dev.to/sudo_pradip/dbt-and-software-engineering-4006,,"Regression Testing, Unit Testing, Acceptance Testing"
+https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a,Jest,"Behavior-Driven Development, Integration Testing, Load Testing"
+https://dev.to/m1pko/data-quality-technical-debt-from-hell,,Regression Testing
+https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8,Cucumber,Test-Driven Development
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf,"Selenium, Appium",Regression Testing
+https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i,"Mockito, Jest","Unit Testing, Integration Testing"
+https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa,Selenium,
+https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363,"JUnit, JUnit",
+https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja,,Regression Testing
+https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin,"Selenium, Cucumber, Appium","Regression Testing, Unit Testing, Integration Testing"
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c,,Smoke Testing
+https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii,,"Unit Testing, Integration Testing"
+https://dev.to/berthaw82414312,"Selenium, Appium","Test-Driven Development, Exploratory Testing, Regression Testing, Unit Testing, Integration Testing"
+https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi,,"Regression Testing, Load Testing"
+https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm,,"Regression Testing, Acceptance Testing, Load Testing"
+https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7,,"Regression Testing, Unit Testing"
+https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i,Selenium,
+https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf,,"Unit Testing, Integration Testing"
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p,"Selenium, Appium",
+https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j,"JUnit, JUnit","Test-Driven Development, Unit Testing"
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e,"Selenium, TestNG, Appium, Jest","Exploratory Testing, Regression Testing"
+https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db,Selenium,
+https://dev.to/meghasharmaaaa/devops-toolchain-mlo,"JUnit, Selenium, TestNG, JUnit",
+https://dev.to/t/testing/page/73,"Selenium, Postman, Jest","Regression Testing, Integration Testing"
+https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm,Selenium,
+https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter,,Load Testing
+https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler,,Load Testing
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data,,Load Testing
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db,,Unit Testing
+https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON,Cucumber,Unit Testing
+https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63,,Load Testing
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9,,Unit Testing
+https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c,,Unit Testing
+https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff,,Unit Testing
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b,,Regression Testing
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22,,"Unit Testing, Integration Testing, Acceptance Testing"
+https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e,,Regression Testing
+https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37,,Integration Testing
+https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69,"JUnit, JUnit",
+https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c,,"Unit Testing, Integration Testing"
+https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143,,"Regression Testing, Integration Testing"
+https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76,"JUnit, JUnit",Unit Testing
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67,,Smoke Testing
+https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality,Selenium,
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory,Selenium,
+https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects,"JUnit, Selenium, TestNG, Cucumber, JUnit","Test-Driven Development, Behavior-Driven Development, Regression Testing, Unit Testing, Integration Testing, Acceptance Testing, Smoke Testing, Load Testing"
+https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle,,"Regression Testing, Integration Testing, Load Testing"
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e,,"Acceptance Testing, Load Testing"
+https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your,,"Regression Testing, Unit Testing, Integration Testing"
+https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov,Selenium,Test-Driven Development
+https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing,,"Test-Driven Development, Unit Testing, Integration Testing"
+https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-,,"Test-Driven Development, Exploratory Testing, Unit Testing"
+https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair,Selenium,
+https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy,,"Unit Testing, Integration Testing"
+https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment,,"Unit Testing, Integration Testing, Acceptance Testing"
+https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f,"Selenium, Cucumber, Appium",
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e,,Regression Testing
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory,,Acceptance Testing
+https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z,,Smoke Testing
+https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla,,Unit Testing
+https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri,"Selenium, TestNG",
+https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye,Selenium,
+https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki,"Selenium, Appium",
+https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view,,Exploratory Testing
From d994ff277f30185628d65b0ac2bdf0fc78464f4d Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 09:50:50 -0300
Subject: [PATCH 13/17] add docs all rsls
---
.../Advancing beyond technicism-2022.pdf | Bin
...n enhanced grey wolf optimizer boosted.pdf | Bin
...rge scale production of satellite 2022.pdf | Bin
...essing business value of Big Data 2017.pdf | Bin
.../BIGOWL2019.pdf | Bin
.../Big data analytics 2022.pdf | Bin
...k_Environment_on_Resizing_Iris_Dataset.pdf | Bin
...tigating the adoption of big data 2019.pdf | Bin
.../Performance in Distributed Big Data.pdf | Bin
...ity Assurance for Big Data Application.pdf | Bin
...n read modeling approach as a basis of.pdf | Bin
...ing of Big Data Analytics with Complex.pdf | Bin
.../alexandrov2013.pdf | Bin
.../chen2018.pdf | Bin
.../demirbaga2022.pdf | Bin
.../ghazal2013.pdf | Bin
.../gulzar2018.pdf | Bin
.../peng2020.pdf | Bin
.../prom-on2014.pdf | Bin
.../rabl2015.pdf | Bin
.../shapira2016.pdf | Bin
.../skracic2017.pdf | Bin
.../staegemann2019.pdf | Bin
.../xia2019.pdf | Bin
.../zhang2017.pdf | Bin
.../zhang2018.pdf | Bin
.../zhang2019.pdf | Bin
.../zheng2017.pdf | Bin
...le Approaches for Test Suite Reduction.txt | 160 ++++++++++++++
...uality Assurance in Big Data Analytics.txt | 105 +++++++++
...ility on big data open source software.txt | 114 ++++++++++
...twarePerspectives_Issues_and_Practices.txt | 180 ++++++++++++++++
..._processing_with_Big_Data_technologies.txt | 88 ++++++++
...e for IoT-based Smart Applications 0.0.txt | 178 +++++++++++++++
...ream Processing Framework Architecture.txt | 202 +++++++++++++++++
...ght-Load Embedded Performance Modeling.txt | 115 ++++++++++
... Management for Big Data Applications.txt | 198 +++++++++++++++++
..._Data_Analysis_for_Industrial_Internet.txt | 109 ++++++++++
...od for Big Data Analytics Applications.txt | 151 +++++++++++++
...structured_Usage_for_Big_Data_Platform.txt | Bin 0 -> 25197 bytes
...-Developing-Fault-Tolerant-High-Loaded.txt | 170 +++++++++++++++
...Application of Test Driven Development.txt | 194 +++++++++++++++++
...ms_in_e-Science_and_e-Commerce_Domains.txt | Bin 0 -> 31827 bytes
...est Driven Development in the Big Data.txt | 197 +++++++++++++++++
...LIGHT TEST DATA FOR BIG DATA COMPUTING.txt | 127 +++++++++++
...sed-Testing-Characteristics-Challenges.txt | 176 +++++++++++++++
...PIPE DryRunner An approach for testing.txt | 131 +++++++++++
... testing analysis of big data products.txt | 58 +++++
...the_Quality_Model_of_Big_Data_Software.txt | 141 ++++++++++++
...echanism_of_Financial_Market_Resource_.txt | 196 +++++++++++++++++
...radiation oncology information systems.txt | 188 ++++++++++++++++
...pReduce program using Induction Method.txt | 158 ++++++++++++++
...Analytics Using Framework Abstraction.txt | 148 +++++++++++++
..._Effect_Analysis_and_another_Methodolo.txt | 108 ++++++++++
... Era of Big Data, IoT and Smart Cities.txt | 186 ++++++++++++++++
...adsoftwareforsatellitebigdataingestion.txt | 115 ++++++++++
...ning of paediatric anthropometric data.txt | 107 +++++++++
...rmanceModelingforBigDataEcosystems2020.txt | 108 ++++++++++
...ta Preprocessing of Large NGS Datasets.txt | 160 ++++++++++++++
...ramework_for_Quality_Assurance_and_Val.txt | 125 +++++++++++
...ng Technique on a Big Data Environment.txt | 203 ++++++++++++++++++
... Agricultural Meteorological Disasters.txt | 174 +++++++++++++++
62 files changed, 4770 insertions(+)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Advancing beyond technicism-2022.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/An enhanced grey wolf optimizer boosted.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/An industry 4.0 approach to large scale production of satellite 2022.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Assessing business value of Big Data 2017.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/BIGOWL2019.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Big data analytics 2022.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Investigating the adoption of big data 2019.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Performance in Distributed Big Data.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Quality Assurance for Big Data Application.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/Schema on read modeling approach as a basis of.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/White-Box Testing of Big Data Analytics with Complex.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/alexandrov2013.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/chen2018.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/demirbaga2022.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/ghazal2013.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/gulzar2018.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/peng2020.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/prom-on2014.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/rabl2015.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/shapira2016.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/skracic2017.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/staegemann2019.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/xia2019.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/zhang2017.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/zhang2018.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/zhang2019.pdf (100%)
rename docs_to_import/{RSL-Daase2024 => rsl_daase2024}/zheng2017.pdf (100%)
create mode 100644 docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt
create mode 100644 docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt
create mode 100644 docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt
create mode 100644 docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt
create mode 100644 docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt
create mode 100644 docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt
create mode 100644 docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt
create mode 100644 docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt
create mode 100644 docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt
create mode 100644 docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt
create mode 100644 docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt
create mode 100644 docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt
create mode 100644 docs_to_import/rsl_oliveira2024/25-Problem-of-Developing-Fault-Tolerant-High-Loaded.txt
create mode 100644 docs_to_import/rsl_oliveira2024/27-Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development.txt
create mode 100644 docs_to_import/rsl_oliveira2024/3 - Big_Data_Testing_Framework_for_Recommendation_Systems_in_e-Science_and_e-Commerce_Domains.txt
create mode 100644 docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt
create mode 100644 docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt
create mode 100644 docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt
create mode 100644 docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt
create mode 100644 docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt
create mode 100644 docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt
create mode 100644 docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt
create mode 100644 docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt
create mode 100644 docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt
create mode 100644 docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt
create mode 100644 docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt
create mode 100644 docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt
create mode 100644 docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt
create mode 100644 docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt
create mode 100644 docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt
create mode 100644 docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt
create mode 100644 docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt
create mode 100644 docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt
create mode 100644 docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt
diff --git a/docs_to_import/RSL-Daase2024/Advancing beyond technicism-2022.pdf b/docs_to_import/rsl_daase2024/Advancing beyond technicism-2022.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Advancing beyond technicism-2022.pdf
rename to docs_to_import/rsl_daase2024/Advancing beyond technicism-2022.pdf
diff --git a/docs_to_import/RSL-Daase2024/An enhanced grey wolf optimizer boosted.pdf b/docs_to_import/rsl_daase2024/An enhanced grey wolf optimizer boosted.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/An enhanced grey wolf optimizer boosted.pdf
rename to docs_to_import/rsl_daase2024/An enhanced grey wolf optimizer boosted.pdf
diff --git a/docs_to_import/RSL-Daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf b/docs_to_import/rsl_daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf
rename to docs_to_import/rsl_daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf
diff --git a/docs_to_import/RSL-Daase2024/Assessing business value of Big Data 2017.pdf b/docs_to_import/rsl_daase2024/Assessing business value of Big Data 2017.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Assessing business value of Big Data 2017.pdf
rename to docs_to_import/rsl_daase2024/Assessing business value of Big Data 2017.pdf
diff --git a/docs_to_import/RSL-Daase2024/BIGOWL2019.pdf b/docs_to_import/rsl_daase2024/BIGOWL2019.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/BIGOWL2019.pdf
rename to docs_to_import/rsl_daase2024/BIGOWL2019.pdf
diff --git a/docs_to_import/RSL-Daase2024/Big data analytics 2022.pdf b/docs_to_import/rsl_daase2024/Big data analytics 2022.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Big data analytics 2022.pdf
rename to docs_to_import/rsl_daase2024/Big data analytics 2022.pdf
diff --git a/docs_to_import/RSL-Daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf b/docs_to_import/rsl_daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf
rename to docs_to_import/rsl_daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf
diff --git a/docs_to_import/RSL-Daase2024/Investigating the adoption of big data 2019.pdf b/docs_to_import/rsl_daase2024/Investigating the adoption of big data 2019.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Investigating the adoption of big data 2019.pdf
rename to docs_to_import/rsl_daase2024/Investigating the adoption of big data 2019.pdf
diff --git a/docs_to_import/RSL-Daase2024/Performance in Distributed Big Data.pdf b/docs_to_import/rsl_daase2024/Performance in Distributed Big Data.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Performance in Distributed Big Data.pdf
rename to docs_to_import/rsl_daase2024/Performance in Distributed Big Data.pdf
diff --git a/docs_to_import/RSL-Daase2024/Quality Assurance for Big Data Application.pdf b/docs_to_import/rsl_daase2024/Quality Assurance for Big Data Application.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Quality Assurance for Big Data Application.pdf
rename to docs_to_import/rsl_daase2024/Quality Assurance for Big Data Application.pdf
diff --git a/docs_to_import/RSL-Daase2024/Schema on read modeling approach as a basis of.pdf b/docs_to_import/rsl_daase2024/Schema on read modeling approach as a basis of.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/Schema on read modeling approach as a basis of.pdf
rename to docs_to_import/rsl_daase2024/Schema on read modeling approach as a basis of.pdf
diff --git a/docs_to_import/RSL-Daase2024/White-Box Testing of Big Data Analytics with Complex.pdf b/docs_to_import/rsl_daase2024/White-Box Testing of Big Data Analytics with Complex.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/White-Box Testing of Big Data Analytics with Complex.pdf
rename to docs_to_import/rsl_daase2024/White-Box Testing of Big Data Analytics with Complex.pdf
diff --git a/docs_to_import/RSL-Daase2024/alexandrov2013.pdf b/docs_to_import/rsl_daase2024/alexandrov2013.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/alexandrov2013.pdf
rename to docs_to_import/rsl_daase2024/alexandrov2013.pdf
diff --git a/docs_to_import/RSL-Daase2024/chen2018.pdf b/docs_to_import/rsl_daase2024/chen2018.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/chen2018.pdf
rename to docs_to_import/rsl_daase2024/chen2018.pdf
diff --git a/docs_to_import/RSL-Daase2024/demirbaga2022.pdf b/docs_to_import/rsl_daase2024/demirbaga2022.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/demirbaga2022.pdf
rename to docs_to_import/rsl_daase2024/demirbaga2022.pdf
diff --git a/docs_to_import/RSL-Daase2024/ghazal2013.pdf b/docs_to_import/rsl_daase2024/ghazal2013.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/ghazal2013.pdf
rename to docs_to_import/rsl_daase2024/ghazal2013.pdf
diff --git a/docs_to_import/RSL-Daase2024/gulzar2018.pdf b/docs_to_import/rsl_daase2024/gulzar2018.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/gulzar2018.pdf
rename to docs_to_import/rsl_daase2024/gulzar2018.pdf
diff --git a/docs_to_import/RSL-Daase2024/peng2020.pdf b/docs_to_import/rsl_daase2024/peng2020.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/peng2020.pdf
rename to docs_to_import/rsl_daase2024/peng2020.pdf
diff --git a/docs_to_import/RSL-Daase2024/prom-on2014.pdf b/docs_to_import/rsl_daase2024/prom-on2014.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/prom-on2014.pdf
rename to docs_to_import/rsl_daase2024/prom-on2014.pdf
diff --git a/docs_to_import/RSL-Daase2024/rabl2015.pdf b/docs_to_import/rsl_daase2024/rabl2015.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/rabl2015.pdf
rename to docs_to_import/rsl_daase2024/rabl2015.pdf
diff --git a/docs_to_import/RSL-Daase2024/shapira2016.pdf b/docs_to_import/rsl_daase2024/shapira2016.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/shapira2016.pdf
rename to docs_to_import/rsl_daase2024/shapira2016.pdf
diff --git a/docs_to_import/RSL-Daase2024/skracic2017.pdf b/docs_to_import/rsl_daase2024/skracic2017.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/skracic2017.pdf
rename to docs_to_import/rsl_daase2024/skracic2017.pdf
diff --git a/docs_to_import/RSL-Daase2024/staegemann2019.pdf b/docs_to_import/rsl_daase2024/staegemann2019.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/staegemann2019.pdf
rename to docs_to_import/rsl_daase2024/staegemann2019.pdf
diff --git a/docs_to_import/RSL-Daase2024/xia2019.pdf b/docs_to_import/rsl_daase2024/xia2019.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/xia2019.pdf
rename to docs_to_import/rsl_daase2024/xia2019.pdf
diff --git a/docs_to_import/RSL-Daase2024/zhang2017.pdf b/docs_to_import/rsl_daase2024/zhang2017.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/zhang2017.pdf
rename to docs_to_import/rsl_daase2024/zhang2017.pdf
diff --git a/docs_to_import/RSL-Daase2024/zhang2018.pdf b/docs_to_import/rsl_daase2024/zhang2018.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/zhang2018.pdf
rename to docs_to_import/rsl_daase2024/zhang2018.pdf
diff --git a/docs_to_import/RSL-Daase2024/zhang2019.pdf b/docs_to_import/rsl_daase2024/zhang2019.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/zhang2019.pdf
rename to docs_to_import/rsl_daase2024/zhang2019.pdf
diff --git a/docs_to_import/RSL-Daase2024/zheng2017.pdf b/docs_to_import/rsl_daase2024/zheng2017.pdf
similarity index 100%
rename from docs_to_import/RSL-Daase2024/zheng2017.pdf
rename to docs_to_import/rsl_daase2024/zheng2017.pdf
diff --git a/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt b/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt
new file mode 100644
index 0000000..a20cfc0
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt
@@ -0,0 +1,160 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2019 IEEE/ACM 41st International Conference on Software Engineering (ICSE)
+Scalable Approaches for Test Suite Reduction
+Emilio Cruciani∗, Breno Miranda†§, Roberto Verdecchia∗‡, and Antonia Bertolino§
+∗Gran Sasso Science Institute | L’Aquila, Italy
+†Federal University of Pernambuco | Recife, Brazil
+‡Vrije Universiteit Amsterdam | Amsterdam, The Netherlands
+§ISTI – Consiglio Nazionale delle Ricerche | Pisa, Italy
+∗emilio.cruciani@gssi.it | †bafm@cin.ufpe.br | ‡roberto.verdecchia@gssi.it | §antonia.bertolino@isti.cnr.it
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract—Test suite reduction approaches aim at decreasing software regression testing costs by selecting a representative subset from large-size test suites. Most existing techniques are too expensive for handling modern massive systems and moreover depend on artifacts, such as code coverage metrics or specification models, that are not commonly available at large scale. We present a family of novel very efficient approaches for similarity- based test suite reduction that apply algorithms borrowed from
+the big data domain together with smart heuristics for finding
+an evenly spread subset of test cases. The approaches are very general since they only use as input the test cases themselves (test source code or command line input). We evaluate four approaches
+in a version that selects a fixed budget B of test cases, and also in an adequate version that does the reduction guaranteeing some fixed coverage. The results show that the approaches yield a fault detection loss comparable to state-of-the-art techniques, while providing huge gains in terms of efficiency. When applied to a suite of more than 500K real world test cases, the most efficient of the four approaches could select B test cases (for varying B values) in less than 10 seconds.
+Index Terms—Clustering, Random projection, Similarity- based testing, Software testing, Test suite reduction.
+I. INTRODUCTION
+In recent years testing has consistently been the most ac- tively investigated topic of main software engineering confer- ences [6]. One prominent problem in software testing research can be abstracted as: Given a software S and an associated test suite T, how can we efficientlyverify whether S passes on T, or -if not- identify the failing test cases? In this formulation, the emphasis is on the term “efficiently”: Otherwise, the easy solution would be to just execute S on T. The research targets the common practical case that along the development process S needs to be repeatedly tested on T (see, e.g., [15]) and the plain retest-all strategy may be too costly considering the available resources (e.g., time).
+To address the above question, in the last three decades many techniques have been proposed, which can be roughly divided in two groups: those that aim at reordering the test cases in T so that those more likely to fail are executed first (test case prioritization), and those that select a subset T ⊆ T that should ideally include the failing test cases, if any; the latter group of techniques is referred to as test case selection or test suite reduction,1 depending on whether when choosing
+1Some authors use the term minimization in place of reduction when the not selected test cases are permanently removed from the test suite. Here, in line with [34], we will consider the two terms as interchangeable.
+1558-1225/19/$31.00 ©2019 IEEE DOI 10.1109/ICSE.2019.00055
+T the changes made to S are considered (modification-aware regression testing) or not [34].
+The proposed techniques have been evaluated and compared against each other using metrics relative to their fault detection effectiveness (e.g., the Average Percentage of Fault Detection of the reordered test suite, or the loss in faults detected by the reduced test suite T ); for test reduction and selection, also metrics relative to cost savings, e.g., the size or the execution time of T are compared against those of the full suite T.
+Another important factor that should be taken into account is the cost of the technique itself, both in terms of the compu- tational effort and of the resources it requires. In other words, when evaluating whether investing on an automated approach aimed at reducing the cost of testing is worth, a complete cost- benefit analysis should also include the overheads implied by the approach [18].
+However, not many of the proposed techniques have consid- ered such implied costs. In 2004, Orso and coauthors already noticed that in regression testing efficiency and precision need to be traded off, because “precise techniques are generally too expensive to be used on large systems” [29]. Gligoric and coauthors [16] were the first to observe that the time consumed by any regression test technique should include an analysis phase, an execution phase, and a collection phase. They noticed that most authors only considered the savings in execution, a few measured also the analysis time, but no one before them measured also the last phase in which the information needed to apply the technique is collected. As pointed out by Elbaum and coauthors [15], at scale industries need approaches “that are relatively inexpensive and do not rely on code coverage information”. In fact, for white-box techniques, the cost of collecting and saving up-to-date code coverage information should also be considered as part of the collection phase. This is confirmed by Herzig [19], who observes that code coverage is not for free as assumed in many works, and can cause up to 30% of time overhead!
+In a recent work [28], we addressed the prioritization of very large test suites and showed that as the size of the test suite grows, most existing approaches become soon not applicable. That work proposed the FAST family of similarity-based test prioritization approaches that outperformed in efficiency and scalability all the compared approaches, except for the white- box greedy total approach. If we count the often ignored
+419
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+costs of measuring coverage, then FAST appears as the only scalable prioritization approach.
+This paper introduces a family of scalable approaches for test suite reduction, called the FAST-R family. As in [28], FAST-R approaches are similarity-based and borrow tech- niques from the big data domain. However, with respect to [28] we apply here several new techniques that allow us to achieve even more efficient results. In FAST we used minhashing and locality-sensitive hashing algorithms [25]. FAST-R approaches adopt other efficient heuristics that are used to derive a set of B evenly spread points in a big data space. Precisely, one approach called FAST++ applies the k-means++ algorithm [4], while another one called FAST-CS uses a recent importance sampling algorithm to construct coresets, a clustering technique that scales up to massive datasets [5]. Moreover, we further enhance the scalability of both approaches by applying the random projection technique, that reduces the space dimensionality while preserving the pairwise distances of the points [21].
+FAST++ and FAST-CS are extremely “practical” techniques in the sense required by all of [15], [16], [19], [28]: i) thanks to the heuristics imported from the big data domain they are computationally very efficient; ii) to reduce a test suite T they require no other information beyond T itself.
+Based on the applied algorithms, the most natural scenario for FAST++ and FAST-CS is that of finding a fixed budget B of test cases. This is referred in literature as inadequate test suite reduction. In the paper we also show how they can be adapted to perform adequate reduction, i.e., preserving coverage: We apply a filtering strategy and search for the most dissimilar test cases only among the ones that cover not yet covered elements. However we acknowledge that at large scale such adequate scenario is not realistic, because as already said coverage information cannot be assumed.
+Although originally proposed for prioritization, we note that FAST approaches [28] could be easily adapted for test reduc- tion: Instead of ordering the whole test suite, the algorithm is stopped when the budget B (or the desired coverage) is reached. Accordingly, we also include in FAST-R and evaluate the reduction version of FAST-pw and FAST-all (the most precise and the most efficient of the FAST family).
+Summarizing, this paper proposes four test suite reduction approaches (two original ones and two adapted from [28]) that can be applied in two testing scenarios: under a fixed budget or for adequate test suite reduction.
+We evaluated the four proposed approaches on commonly used C and Java benchmark programs against state-of-the- art reduction techniques, obtaining comparable results for effectiveness but notable improvements in efficiency. More interestingly, to validate our claims on the scalability of the approaches, we applied all four of them to the budget reduction of a test suite formed by more than 500K Java test cases collected from GitHub. At such large scale, not considering the preparation time, FAST-pw and FAST++ required several hours to reduce the suite, e.g., ∼37 hours and ∼11 hours respectively for a 10% size, but FAST-all required 25 seconds
+and FAST-CS 9 seconds. Actually, FAST-CS looks as a real breakthrough as it took less than 10 seconds for the reduction independently from the percentage, and needed just 5 minutes for preparation in contrast to more than 3 hours taken by FAST-all.
+The original contributions of this work include:
+• The FAST-R family of scalable approaches for inade- quate test suite reduction.
+• A variant of all the approaches for adequate test suite reduction.
+• A large-scale experimentation for evaluating the effi- ciency and effectiveness of the approaches in three sce- narios, including a very large-scale test suite.
+• An open-source automated framework along with all the data used for the experiments to support verifiability.
+The paper is structured as follows. In the next section we survey related work. In Section III we present the approaches used. In Section IV and V, respectively, we present the evalua- tion methodology and the achieved results. Finally, Section VI draws conclusions and hints at future work.
+II. RELATED WORK
+This work is related to software regression testing and more specifically to test suite reduction techniques. The literature on software regression testing is huge: Two surveys [13], [35] provide a broad overview of prioritization, reduction (or minimization, used here in interchangeable way), and selection techniques. In particular, Yoo and Harman [35] reviewed the literature until 2009. Concerning reduction techniques, most of the surveyed works consists of heuristics over white-box coverage criteria, at various level of granularity (including statement, branch, function, or call-stack). Some approaches augment the coverage information with additional inputs by the tester (e.g., weighting coefficients or priority assignments), which may be costly or even biased [35]. Among the few “interesting exceptions” doing black-box reduction, they report some combinatorial, fault-based, and model-based techniques. More recently, Do [13] surveys further advances over [35]. In particular, for test suite reduction she reviews four more recent techniques, two of which are again coverage-based, and two ones introduce specific reduction techniques: one for GUI testing [3], and another for combinatorial interaction testing [7]. Note that both surveys [13], [35] include no work on similarity-based test suite reduction, as we propose here.
+A recent systematic survey by Rehman and coauthors [23] focuses specifically on test suite reduction. The study sur- veyed the literature between 1990 and 2016, identifying a set of 113 relevant primary studies. Based on the adopted algorithms, they classify the approaches into: Greedy (mostly coverage-based), Clustering, and Search-based, plus hybrid combinations thereof. Our approach would fitin the Clustering group, in which out of the surveyed 113 studies they only find three works: one [8] using machine learning algorithms, and two [27], [33] using hierarchical clustering.
+We take here a distance from most of the techniques surveyed in the above studies, since FAST-R is expressly
+motivated by considerations of scalability and practical ap- plicability. In this perspective, our approach is more closely related to few recent works based on coarse-grained heuristics, clustering, and similarity.
+In recent years some collaborative efforts between academic and industrial researchers start to appear that develop coarse- grained approaches trading precision with efficiency/scalabil- ity. Strictly speaking such works focus on test case selec- tion and not test suite reduction, in that the choice of tests to execute is modification-aware. For example, Knauss and coauthors [24] use a statistical model that relates the changed code fragments (or churns) with test outcomes on Ericsson systems; considering a continuous integration development environment, Elbaum and coauthors [15] propose a strategy apt for Google testing process, which combines test case selection during pre-submit testing and test case prioritization in post-submit testing. Both selection and prioritization apply heuristics based on failure history and execution windows. By relying on very efficient algorithms, our FAST-R approaches can scale up to large industrial systems as the above works, while not sacrificing much of precision in deriving a represen- tative subset of the test cases.
+Our similarity-based approach is related to several techniques that exploit the diversity among test cases for guiding selection. Some techniques build on the notion of adaptive random testing (ART) [10] that, in a few words, first selects a random set of test cases and then filters them based on their distance from the already selected test cases. Several variant instantiations of ART have been proposed, including ART-D [20] and ART-F [36] that we use as competitors to FAST-R and that are further described in Section IV.
+Some black-box approaches use similarity to reduce model- based test suites. Both test case reduction [2] and test case selection [9], [17] techniques have been proposed. These techniques have been conceived for industrial use: For example Hemmati and coauthors [17] pursue as a main goal a selection of test cases adjusted to the available testing budget. However, all such model-based approaches rely on the assumption that a formal model of program behavior, e.g., a LTS, is available. In contrast, FAST-R does not need to assume anything else beyond the test cases themselves.
+A few works have proposed to leverage clustering of test cases as we do here, e.g., [11], [30]. However they calculate the similarity between two test cases based on code coverage information, which as said already could be too expensive at
+the testing scale we aim.
+III. THE APPROACHES
+Given a test suite T and some fixed budget B ≤ | T|, the goal of similarity-based test suite reduction is to select B evenly spread test cases out of the test suite. If we model each test case as a point in some D-dimensional space, then the problem could be thought of as that of finding the central points of B clusters. The problem of clustering is NP -hard, but we are able to perform scalable similarity-based test suite
+1. Test Suite 3. Random Projection
+t1: grep -e 'foo' file t1 t2: grep -v -e 'foo' file
+t2 t3: grep -F 'bar' file
+t3
+Comp1Comp2Comp3
+2. Vector Space Model (Term Frequency)
+t1 t2
+t3
+grep -e -v -F 'foo''bar' file
+Fig. 1: Visual representation of FAST-R preparation phase.
+reduction by borrowing a technique from the big data domain and using it in combination with some efficient heuristics.
+We consider an Euclidean space, a metric space where the distance between any two points is expressed by the Euclidean distance – what one could think of as the straight line connecting them. Let x, y ∈ R^D be two points; the Euclidean distance between them is defined as d(x, y) = √( Σ_{i=1}^{D} (x_i − y_i)² ).
+In the preparation phase of our approaches (Fig. 1) we transform test cases into points in the Euclidean space via the vector-space model: The textual representation of each test case, e.g., test source code or command line input (Fig. 1.1), is mapped into an n-dimensional point where each dimension corresponds to a different term of the source code and n is equal to the total number of terms used in the whole test suite. The components are weighted according to term-frequency scheme, i.e., the weights are equal to the frequency of the corresponding terms (Fig. 1.2).
+The computation of the Euclidean distance between any two n-dimensional points can be expensive when n is large. To overcome this problem we exploit a dimensionality reduc- tion technique called random projection. Roughly speaking, random projection works because of Johnson-Lindenstrauss Lemma [21], which states that a set of points in a high- dimensional space can be projected into a much lower- dimensional space in a way that pairwise distances are nearly preserved. In particular we use sparse random projection [1], [26], an efficient implementation of the technique that is suitable for database applications (Fig. 1.3).
+We model the clustering problem as a k-means problem, with k = B. Given n points in a metric space, the goal of k-means is to find a k-partition P = {P_1, ..., P_k} of the points that minimizes the sum of the squared Euclidean distances between each point and the center of its partition. Formally, the goal is to find argmin_P Σ_{i=1}^{k} Σ_{x ∈ P_i} d(x, μ_i)², where μ_i is the center of the points belonging to partition P_i.
+There exist efficient techniques that are able to find an approximate solution to k-means. One is k-means++ [4],
+Algorithm 1 FAST++
+Input: Test Suite T; Budget B
+Output: Reduced Test Suite R
+1: P ← RandomProjection(T)    ▷ Preparation phase
+2: s ← FirstSelection(P)
+3: R ← List(s)
+4: D ← Distance()    ▷ Squared distance to closest point in R
+5: D(s) ← 0
+6: while (Size(R) < B) do
+7:    for all t ∈ P do
+8:       if d(P(t), P(s))² < D(t) then
+9:          D(t) ← d(P(t), P(s))²    ▷ Squared Euclidean distance
+10:   s ← ProportionalSample(P, D)
+11:   R ← Append(R, s)
+12:   D(s) ← 0
+13: return R
+which achieves an O(log k) approximation ratio2 in expec- tation and finds the centers of the clusters in k linear time iterations. The algorithm is the de facto standard technique for the initialization phase of k-means algorithms. After the initial centers are selected, standard k-means algorithms would iteratively compute the clusters. In our case, to be more efficient, we stop at this stage and use the k selected centers as the test cases of the reduced test suite. The reduction approach that exploits k-means++ as greedy reduction strategy is called FAST++ (Algorithm 1).
+FAST++ starts by preprocessing the test suite T, mapping each test case into a vector according to the vector-space model and then lowering its dimensionality via random projection (Line 1). After the preparation phase, the reduction algorithm works only on the projected data P on which the greedy selection of k-means++ is applied. First, pick the first point uniformly at random3 (Line 2). Then, until B points have been selected: i) for each projected point t ∈ P, compute the squared distance d(t, R)² between t and its nearest center in R that has been already picked (Lines 7, 8, 9); this can be done incrementally by maintaining the minimum distance and computing only the distance with the last selected point (Lines 8, 9); ii) pick the next point s with probability proportional to its distance to R (Line 10).
+Another possible approach to simplify the clustering prob- lem is that of using coresets. Given a set of points S, a coreset is a small subset of S that well approximates the geometric features of S. One usually constructs a coreset first and then finds the centers of the clusters on it, reducing the complexity of the problem while still having theoretical guarantees on the solution. In our case, though, the size of the reduction grows linearly with the size of the test suite making this standard approach less efficient – the complexity of the problem would not lower much. Instead, exploiting a recent extremely efficient algorithm developed for massive datasets [5], we construct a coreset of size B and use it as reduced test suite. The algorithm is based on importance sampling: All points have nonzero
+2In a minimization problem, an α-approximation algorithm finds a solution which is not worse than α times the optimum.
+3Note that this is to stick with k-means++ algorithm, but any other criterion for the choice of the first test case is possible.
+Algorithm 2 FAST-CS
+Input: Test Suite T; Budget B
+Output: Reduced Test Suite R
+1: P ← RandomProjection(T)    ▷ Preparation phase
+2: μ ← Mean(P)
+3: for all t ∈ P do
+4:    Q(t) ← 1/(2|T|) + d(P(t), μ)² / (2 Σ_{t′ ∈ P} d(P(t′), μ)²)    ▷ Importance sampling
+5: R ← ProportionalSampleWithoutReplacement(P, Q, B)
+6: return R
+probability of being sampled, but points that are far from the center of the dataset (potentially good centers for a clustering) are sampled with higher probability. We call the reduction approach that uses this technique FAST-CS (Algorithm 2).
+FAST-CS starts with the preparation phase to compute the set of projected points P (Line 1). Then, it only requires two full passes on P : First it computes the mean of the data points (Line 2) and then it uses it to compute the importance sampling distribution (Lines 3, 4). The probability of each point to be sampled is a linear combination of the uniform distribution (first term in Line 4) and of the distribution which is proportional to the squared Euclidean distance between the data point and the mean of the data (second term in Line 4). Then B points are sampled out of P without replacement with probability proportional to their importance sampling probability (Line 5) and used as reduced test suite.
+Both FAST++ and FAST-CS have also been adapted to be adequate, i.e., to perform a reduction that guarantees some fixed coverage. 4 Getting coverage information of each test case as an extra input, both the proposed approaches are able to reduce the test suite such that some fixed coverage is achieved. This is possible thanks to a filtering phase. In FAST++, all test cases which would not add any extra coverage are filtered out after each selection and the next selection is carried out only among the remaining ones. As for FAST-CS, log|T| test cases are picked at each subsequent iteration and then importance sampling probabilities are recomputed setting to 0 the ones relative to test cases which are filtered out. Picking log|T| tests per iteration instead of just one makes the algorithm scale better to big test suites. Moreover, this choice does not increase the size of the reduced test suite since the selected test cases are still diverse among them and thus the chance of covering different parts of the software under test is still high. Finally, instead of stopping when the reduction reaches size B, both adequate approaches stop whenever the reduction achieves some fixed coverage.
+As said, this work was inspired by the FAST family of test case prioritization approaches [28]: Roughly speaking, those approaches could be also used for the goal of test suite reduction by only picking the first B test cases of the prioritized test suite. To assess also their efficiency and effectiveness when applied to test suite reduction, we modified
+4The pseudocodes of adequate versions are not reported for lack of space, but they can be found online [12].
+all the original algorithms to stop after B test cases are prioritized. Moreover we adapted them to be adequate as well, again using the same filtering phase introduced in FAST++ and FAST-CS.
+IV. EVALUATION METHODOLOGY AND SETUP
+We conducted some experiments to evaluate the effective- ness and the efficiency of the proposed approaches in different application scenarios. As a first scenario we considered the case in which test resources are limited and a tester can only run a small subset of test cases from an existing test suite: We call this the budget scenario, because we fix a priori a reduction percentage of test suite size. In this scenario we can apply the natural version of the proposed approaches. As a second case we considered adequate scenario, in which the code coverage measures of the whole test suite are preserved. To study this scenario, we applied the adequate version of the approaches. We also studied a third case, called the large- scale scenario, in which we apply the inadequate reduction on a very large test suite.
+A. Research Questions
+We address the following research questions (RQs):
+RQ1: How effective are the proposed test suite reduction ap- proaches in comparison with state-of-the-art techniques?
+The goal of test suite reduction is to reduce the size of a test suite while maintaining its fault detection effectiveness. Thus the effectiveness of reduction approaches is commonly measured in terms of the Fault Detection Loss (FDL), and for adequate approaches also in terms of Test Suite Reduction (TSR). Consequently we articulate the above RQ1 into the two following subquestions:
+RQ1.1: [FDL] What is the fault detection loss of the pro-
+posed approaches compared with that of state-of-the-art techniques?
+To answer RQ1.1 we measure: FDL = (|F| − |F′|) / |F|, where F is the set of faults detected by the original test suite T and F′ is the set of faults detected by the reduced test suite T′.
+RQ1.2: [TSR] What is the test suite reduction achieved by
+the proposed approaches compared with that of state-of- the-art techniques?
+To answer RQ1.2 we measure: TSR = (|T| − |T′|) / |T|.
+We answer RQ1.1 in both budget and adequate scenarios, and RQ1.2 only in the adequate scenario.
+To evaluate the efficiency we address the following RQ:
+RQ2: How much time is taken by the proposed approaches
+to produce the reduced test suite?
+We measure the time spent in preparation and in reduction. We answer RQ2 in all the three scenarios: In the budget and adequate scenarios we compare the time taken by the proposed approaches against state-of-the-art competitors; in the large- scale scenario we could only apply our proposed techniques, as all competitors approaches require coverage information that at such scales are not available.
+B. Compared reduction approaches
+We recall that the FAST-R family of proposed approaches consists of the newly devised FAST++ and FAST-CS plus the modified reduction versions of FAST-pw and FAST-all, first introduced for prioritization [28].
+The competitor approaches we consider are ART-D [20] and ART-F [36], which belong to the family of Adaptive Random Testing techniques [10]. In brief, they both work by first deriving a candidate set of test cases from those not yet selected that would increase coverage, and then selecting from within the candidate set the most distant test case from those already selected. The two techniques differ on the candidate set size (Dynamically changing in ART-D and Fixed in ART-F) and on the adopted distance metric (Jaccard and Manhattan, respectively). We selected these approaches because they also aim at obtaining an evenly spread set of test cases as in our approaches, and also because in the results reported in [28] they were among the best competitors to FAST. Differently from FAST-R, ART-D and ART-F use coverage measures.
+Finally, we also applied the GA (Greedy Additional) ap- proach [31], which for its simplicity and effectiveness is often considered as a baseline. GA selects the test case that covers the highest number of yet uncovered elements.
+For all three competitors we consider three variants, applied to coverage of function, statement, and branch.
+C. Experiment material
+To evaluate the budget scenario and the adequate scenario we took 5 C and 5 Java programs as experimental subjects. The C programs (consisting of Flex v3, Grep v3, Gzip v1, Sed v6, and Make v1) were gathered from the Software In- frastructure Repository (SIR) [14]. For each of these programs subsequent versions are available, each containing a varying number of seeded faults. In our experiment we considered for each program the version containing the highest number of difficult to reveal faults, i.e., faults that are discovered by less than 50% of the test cases. This was done to avoid including in the experiment “anomalous” versions, e.g., versions in which most faults are revealed by the majority of the test cases or no faults are revealed at all. In total, the C subjects amounted to 52,757 LoC containing 49 faults, and were accompanied by a test suite comprising 2,938 test methods.
+The 5 Java programs taken into account (namely Closure Compiler, Commons Lang, Commons Math, JfreeChart, and Joda-Time) were taken from the Defects4J database [22]. Such database provides a set of programs available in different versions, each containing a single real fault. For our exper- iment, we considered the first version of the programs. In total, the Java Subjects amounted to 320,990 LoC and were accompanied by a test suite comprising 1198 test classes.
+To evaluate the large-scale scenario, we used a set of more than 500K real-world test cases gathered through the GitHub hosting-service. To efficiently collect a high number of heterogeneous test cases, we selected classes committed to the master branches of the available Java repositories, precisely commits adding a single class which adheres to common
+naming conventions for JUnit classes. In total through this process we collected 514,272 test cases, amounting to roughly 39 million LoC for a total size of 14 GB.
+D. Experiment procedure
+The experiment was performed on an AMD Opteron™ 6376 with 2.3GHz CPU, 16MB L2 cache, 64GB RAM, running Ubuntu 16.04.5 LTS. The procedure varied according to the scenario considered. More specifically:
+1) Budget scenario: We fixed a set of budgets B for
+each experimental subject (both C and Java). The budgets considered ranged between 1% and 30% of the total test suite size of each subject with a step increase of 1%. While the FAST-R approaches only required the test suite for the reduc- tion process, all competitors could take in input 3 different coverage types, namely function, statement, and branch. We therefore performed a single study for the FAST-R approaches and 3 for each of the competitors. We used each compared approach to reduce the test suite of the experimental subjects by considering all B budgets. The metrics considered were fault detection loss, preparation time, and reduction time. The measurements were repeated 50 times for each study given the stochastic nature of the approaches.
+2) Adequate scenario: The FAST-R approaches require
+coverage information for the filtering phase as an extra input to have an adequate reduction. The competitor approaches instead require exclusively the coverage information. For this scenario we considered function, statement, and branch cov- erage. We used the compared approaches to reduce the test suite of each experimental subject (both C and Java) so to maintain the coverage prior of the reduction. We measured fault detection loss, test suite reduction, preparation time, and reduction time. The measurements were repeated 50 times for each study given the stochastic nature of the approaches.
+3) Large-scale scenario: As for the budget-scenario, we
+considered a set of budgets B ranging from 1% to 30% of total test suite size of the subjects, with a step increase of 1%. In this setting we exclusively evaluated FAST-R approaches, as the other approaches require coverage information, which in this scenario is not available. To answer RQ2, we applied the approaches to the GitHub dataset for each possible reduction of B, and measured preparation time and reduction time.
+V. RESULTS
+In this section we report and discuss the results. Note that with the aim of supporting independent verification and replication, we make available the artifacts produced as part of this work [12]. The replication package includes approaches, input data, statistical analyses, and additional results.
+A. The budget scenario
+1) Fault Detection Loss: The box plots of Figure 2 display
+the FDL of the compared approaches and more details are provided in Table I. The results are grouped by programming language because the C and Java programs investigated contain different types of faults (see Section IV-C). The approaches
+c
+100 75 50 25 0
+
●●●
●●●●●●●●●●●●●●●●
●●●●●●●●
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+425
diff --git a/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt b/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt
new file mode 100644
index 0000000..30468c8
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt
@@ -0,0 +1,105 @@
+114 Telfor Journal, Vol. 11, No. 2, 2019.
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Quality Assurance in Big Data Analytics: An IoT Perspective
+Nicole Ann Fernandes and Rupali Wagh
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+115 Telfor Journal, Vol. 11, No. 2, 2019.
+Abstract —Emergence of IoT as one of the key data contributors in a big data application has presented new data quality challenges and has necessitated for an IoT inclusive data validation ecosystem. Standardized data quality approaches and frameworks are available for data obtained for a variety of sources like data warehouses, webblogs, social media, etc. in a big data application. Since IoT data differs significantly from other data, challenges in ensuring the quality of this data are also different and thus a specially designed IoT data testing layer paves its way in. In this paper, we present a detailed review of existing data quality assurance practices used in big data applications. We highlight the requirement for IoT data quality assurance in the existing framework and propose an additional data testing layer for IoT. The data quality aspects and possible implementation models for quality assurance contained in the proposed layer can be used to construct a concrete set of guidelines for IoT data quality assurance.
+Keywords — Big Data, Internet of Things (IoT), Data Quality, Data Testing, IoT data Validation, Quality of Service (QoS).
+I. INTRODUCTION
+The Internet of Things has not only changed our day-to-day lives but also revolutionized the entire computing and analytics paradigm. Today IoT is the key contributor in
+making informed decisions across domains. With these connected devices generating enormous data, seamless integration of this data in a big data application for further analytics is the need of the hour. Since quality data is the backbone of any analytical solution, ensuring the quality of big data is a fundamental task in big data testing. Since the poor data quality may produce inaccurate results, a comprehensive data quality assurance framework is followed for big data testing [1]. The famous V’s of big data – volume, variety, velocity, and veracity bring complexities with them. This has been the reason for the inclusion of rigorous data quality check which otherwise was not required in a traditional system [2] data testing.
+Paper received October 30, 2018; revised April 4, 2019; accepted May 04, 2019. Date of publication December 25, 2019. The associate editor coordinating the review of this manuscript and approving it for publication was Prof. Miroslav Lutovac.
+Nicole Ann Fernandes is a postgraduate student, Department of Computer Science, CHRIST (Deemed to be University), Bengaluru, India (e-mail: fernandes.ann@mca.christuniversity.in).
+Rupali Wagh is Associate Professor with the Department of Computer Science , CHRIST (Deemed to be University), Bengaluru, India (e-mail: rupali.wagh@christuniversity.in).
+In the last decade, we have witnessed the dominance of IoT and today IoT has become a major contributor in the big data application environment. It brings newer complexities in the big data ecosystem. Vastly different sensors from a huge network of connected devices produce data which require careful and systematic preprocessing before actually being fed for analytics. While the wear and tear of the devices/sensors, faulty devices, etc require actions which may be extrinsic to the computing life cycle, but identification of these issues needs to be done intrinsically by analyzing the captured data. IoT is further challenged by security concerns and network issues as they directly impact the reliability and accuracy of data. Thus, the data validation for IoT data goes beyond just data cleaning, aggregation and transformation, and shifts more towards intelligent and machine learning based methods in data testing like ontologies for data abstraction and predictive methods for threat prediction. Since IoT based big data analytics is becoming more and more prevalent, the data quality issues are becoming very significant. Additionally, IoT analytics due to its ubiquitous nature impacts human life largely and hence ensuring the quality of IoT data has become very critical.
+In this paper, we discuss major data quality challenges specifically with respect to IoT data. We also elaborate the implementation models used to assure the quality of IoT data and propose an additional IoT data validation layer, which can act as a basis for constructing an IoT inclusive data quality assurance framework for any big data application.
+The paper is organized as follows- Section II elaborates a generic big data test framework, section III emphasizes the dominance of IoT data in today’s big data applications. Section IV presents data quality challenges with respect to IoT data and various implementation models and methods required for IoT data quality assurance. Section V proposes an additional layer in Big data-IoT framework
+II. BIG DATA TEST FRAMEWORK
+The variety and volume of data have become a challenging aspect to databases. With unstructured, structured, semi-structured data being produced every second, data testing is extremely complex. The 4 V’s Volume, velocity, variety, and veracity of big data demand the unorthodox form of information that enables magnified insight, decision-making. Big data testing is absolutely dissimilar from general testing scenarios as it involves processing huge data quickly for a business to make better decisions. The primary goal of big data testing is cleaning, masking, monitoring big data but none of these deals with
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Fernandes and Wagh: Quality Assurance in Big Data Analytics: An IoT Perspective 116
+data validation in a big data framework which lacks the quality of data. Big data testing is verifying data to ensure data transformation, data quality, and automate the regression testing.
+Validation of structured and unstructured data in a test environment increases cost and time. Big data testing is based on Extract, Transform and Load (ETL). In the Extract phase test data is uprooted from various sources, traditional databases like relational database management system (RDBMS), the test data and process are verified and in the transformation phase, once the transformation is successful, it is either sent to the data warehouse or deleted. Quality is a major issue and requires a peculiar infrastructure [2]. Data warehouse staging area is a short-term location where data from all sources are recorded. Since data cannot be extracted directly from all databases at the time, therefore, data in the data warehouse is momentary
+Quality Assurance (QA) defines whether a product or service meets the specified requirements. Fig. 1 describes various parameters that could cause tangible and intangible losses to an organization due to poor data quality. Unreliable data leads to wastage of resources, business revenues, decisions, productivity, and prevents data from being shared in an organization. Meeting customer requirements is far beyond the reach if data is not validated and accurate. Due to unreliable systems, low-quality data collections, unorganized data, connectivity issues, technical faults between sensors lead to business loss. Data is said to be reliable and consistent when data collected and analyzed remains substantial over time. Data quality parameters, data accuracy, data timeliness, data accessibility, data accountability, data completeness, data scalability, and data security and their significance are discussed in detail in [1], [4].
+
+Fig. 1. Data quality concerns in big data environment.
+To ensure the quality of data the following big data quality services are generically employed in a big data testing framework [1], [5], [6].
+· Data collection: Gathering and quantifying information from various sources.
+· Data cleaning: Since data is collected from various sources detecting and correcting untrustworthy, inaccurate, corrupt records data is a major role in big data testing which ensures data quality.
+· Data transformation: Process of the transfiguration of dataset from a source data system to the format of a destination data system.
+· Data loading: Once the data is transformed it is loaded into a big data repository such as NoSQL big database and Hadoop domain.
+· Data analytics: Inspection, modeling, and modification of data into reports, conclusion, supports decision- making.
+· Data aggregation: The arrangement of data from a database to develop datasets for data processing.
+With the high computing requirement and complexities of the processes in the big data testing framework, test as service (TAAS) is gaining popularity in recent years. TAAS is primarily aimed at providing solutions regarding cost, data and packet loss, and scalability issues of IoT devices and test semantic correctness and functional features remotely [2]. TAAS with IoT testing framework rectifies unnecessary cost, traditional software testing in the development of IoT devices, provides real-world testing and reduces strain on internal resources. With emerging Machine learning methods into software testing [3], software, TAAS is becoming more and more relevant [3].
+Existing comprehensive big data quality framework is primarily centered around the data coming from data warehouses, weblogs and social media. Though IoT is an inseparable component of today’s big data application, Inclusion of IoT focused data validation is not yet seen as a mandatory element in the framework.
+III. IOT KEY CONTRIBUTOR OF DATA IN BIG DATA APPLICATION
+IoT enables things to actively participate in sharing data with other objects, communication over the network (wired/wireless), recognizing changes and events in other objects where things/object can react inaccurately.
+The internet of things helps to connect anything with everything. IoT is connected to cellular services like 30% are phones, 23% tablets, and others are machine-to-machine communication. With the advancement of high-speed internet connection like Broadband connectivity, Google fiber which provides high-speed low latency network.
+As shown in Fig. 2, it is projected that IoT will grow about 267 billion in 2020 [7]. IoT generates huge information, this information is analyzed, and resets factors based on the emergency. Sensors help to detect motion; a voice call may be sent through the internet or appropriate altars are sent on devices. With the advancement of technology and the use of sophisticated sensors, IoT generated data reduces human efforts and interaction and improves decision analytics. Real Time Data generated by IoT is highly preferred for decision-making because of its high business value.
+IoT generated data is seldom analyzed independently and often exists as one component of the big data analytics ecosystem, Fig. 3. Big data and IoT is used widely across domains to provide diverse solutions. Big data analytics is used to examine huge datasets in order to uncover hidden patterns, customer requirements, market trends, business information, better agriculture planning, reduce the cost of
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+117
+Telfor Journal, Vol. 11, No. 2, 2019.
+medical systems and decision-making. There are few domains where IoT and big data analytics has become the norm for the functioning of various processes. Health gadgets with various IoT enabled sensors are becoming the backbone of patient monitoring systems and providing phenomenal support to inefficient customer care [8], [9]. IoT devices are being used to monitor and build patient- centric, remote consultation, to help critical conditioned patients [10]. Smart farming includes technologies like IoT, big data, data mining, machine learning techniques, cloud computing which enables farmers to take actions and better- informed decisions on farming practices. Sensors are used on fields and crops which provides data points on soil conditions, detailed information on wind, water availability and pest infections [9]. Sensors like SHT10, SEN0161, Humidity sensor and Obstacle sensor (ultrasonic) are used on various hardware and software that includes AVR microcontroller atmega 16/32, ZigBee module, Raspberry pi, Dip trace, SinaProg, Raspbian Operating system. Thus, it is now possible to monitor productivity with just a click of a button. Smart homes technologies include a suit of IoT devices, appliances, or systems that connect into a network and can be controlled. IoT and big data fabricate the use of accommodating new devices, appliance, and other technologies. IoT is growing exponentially, Sophisticated sensors and chips are embedded into systems that surround us in a smart home environment which comprise of Temperature sensor, Voice/Sound sensors, an Air composition sensor, Infrared sensors, pressure sensors, Video cameras for surveillance. When an unusual motion takes place, an alert message is sent to the user [11], [12], [13], [14].
+
+Fig. 2. Worldwide Diversification of IoT Devices, as projected by [7].
+Thus, the amount of data generated by connected devices is tremendously huge. Its assimilation in a big data system is further complicated by the variety, time dependency, compatibility, and interpretability.
+IV. QUALITY IOT DATA: CHALLENGES
+IoT and big data analytics has almost become omnipresent and also brings data challenges along with it. A Huge number of sensors generating an enormously high volume of diverse data requires a multifaceted data quality assurance approach. In this section, we emphasize three main characteristics of data which are essential for producing valid and applicable results namely data reliability and accuracy, data timeliness and data
+interpretability. We discuss the challenges in ensuring these qualities in IoT data and review the state of art of the solutions provided for them.
+
+Fig. 3. IoT and Big Data Analytics.
+A. Reliable and Accurate Data – IoT Security
+Security and privacy of data are very crucial to the IoT paradigm. This undoubtedly is the most researched area in the field of IoT, cloud computing and big data because of its high impact on the business value of such systems. Though the solutions to IoT security are based in multiple domains like networks and machine learning, the primary objective is to collect genuine and authentic data. Securing systems is based on a few standard principles: confidentiality, availability, authentication, integrity. Some devices used in IoT have extremely limited storage, battery power, processing rate are unable to cope with the unique security systems and wireless networks are widely used in IoT devices which could lead to packet loss. Security is a widely researched problem in IoT and main security concerns are identified as Eavesdropping, Mac spoofing, Dictionary attack, and Man-in-the-middle attack. [14], [11]. While traditional solutions include encryption and cryptography, a newer research direction based on IoE, internet of entities with blockchain based validation mechanisms is being proposed in the research community [15]. In network security for smart home, domain is proposed in [11] where communication rules for every device are installed in every home router and are further used to filter malicious traffic. The layered architecture of IoT posed challenges in providing end to end privacy and security. Improved privacy preserving the architecture of IoT as proposed in [16] is the need of the hour which is based on the concept of using multiple cloud data stores for preserving privacy. Based on this generic architecture domain specific architecture for more secure data in IoT is also proposed. 
Application of machine and deep learning approaches for building robust IoT big data applications [5] are effectively used for threat categorization as well as predicting the layer where the threats can surface viz, network services surface/cloud service surface/web application interface, etc.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Fernandes and Wagh: Quality Assurance in Big Data Analytics: An IoT Perspective 118
+B. Data Timeliness – Real-Time Data Analytics Models
+With heterogeneous data coming continuously from multiple sources spanning multiple geographic locations, it's difficult to separate valuable data from irrelevant information. IoT big data analytics is further challenged by the need for real-time data updates and its real-time analytics due to the continuous operational state of IoT devices, thus a “Fog Computing” lightweight computing paradigm becomes relevant for IoT. Fog computing is similar to cloud computing which provides temporary storage, services, and application which provides a promising solution for big data applications and IoT. Fog computing is an intermediate layer between cloud computing and data generated from various sources. It reduces the processing time and cost spent on sending huge data to the cloud. As fog nodes analyze all the data that needs to be recorded and delivered into the cloud which is used for prediction or a historical purpose. Fog nodes provide optimization approach for an IoT sensing application which improves data security and reduces data latency, faster response. Fog nodes analyze data with minimum requirements like power and fewer resources by appending an appropriate sensing module. The performance level is reduced as data is uploaded into the fog nodes [17]. Fog computing in IoT can eliminate the dependency on a centralized data center and perform the in-network computation to reduce the latency in computations. This lightweight computation also augments security solutions as it allows lightweight encryption schemes through fog-to-things paradigms [18], [19]. Data generated by sensors and devices are processed efficiently and closer to where the data is originated instead of sending it to a diverse data center as is done by edge computing. A massive amount of data is collected and processed by edge devices locally, stores condemnatory data. Edge computing is closer to end users and provides Quality of Services (QoS) to end users. Edge computing nodes are also called edge/cloudlet servers. Edge servers reduce operating cost, provide real-time analysis, reduce network traffic and improve the performance of applications [20].
+C. Data Interpretability – Semantics of IoT Generated Big Data
+The three V’s of big data volume, velocity, and variety are inherently applicable to IoT data. Before integrating this data with other non-IoT data for further analytics, high-level abstraction of the raw IoT data can improve the interpretability of the data. IoT requires algorithms that can analyze data that comes from a variety of sources in real-time. Semantic technologies tend to enhance the abstraction of IoT data through annotation algorithms [17]. The “variety” of IoT data encompasses time series data, streaming data, geographical data, data coming from wearable devices, etc. Providing insights based on these raw values requires a plethora of algorithms. Semantic technologies for interoperability on IoT are one of the latest research fields in IoT [14], [21]. Due to the heterogeneity of devices and platforms in any big data and IoT framework, augmenting data with semantics that the data represents can add a very high value to the raw data that accumulates with a very high velocity. Recent paradigms like Resource Description Framework (RDF) are gaining popularity due to the flexibility that they provide in the continuous query processing [22]. Application of semantic annotations of IoT data in healthcare domain is discussed in [23]. The paper shows semantic annotations of the heterogeneous data gathered using IoT devices of patients and physicians to transform the data into RDF. This data is then processed by SPARQL (SPARQL Protocol and RDF Query Language) facilitating the interoperability across devices. The concept of interoperability is very much relevant in all the domains of IoT and requires standardized data representation formats. These formats essentially describe data as linked objects or entities with characteristics and relationships. For example, ontologies are required further for knowledge sharing to interpret the data representation [24]. Semantic interoperability can be challenging: integration of multiple data sources, a distinctive ontological point of reference, P2P (peer to peer) communication, semantic discovery of data sources and services. IoT interconnected devices face standardization and reusability issues due to unpredicted faults.
+V. IOT INCLUSIVE QUALITY ASSURANCE FRAMEWORK FOR BIG DATA WITH IOT
+IoT has made a machine to machine communication possible. We propose an additional IoT quality assurance layer before IoT data is integrated with the generic big data application. As shown in Fig. 4, the proposed IoT data validation layer sits on top of the data collection layer. A series of actions proposed in the layer would ensure that the raw IoT data is transformed into suitable abstraction before getting integrated into any new-age analytics model.
+As shown in Fig. 4 an IoT data quality validation layer can be included in Big-IoT framework immediately after data collection. Before integrating raw data collected from IoT devices, a series of transformation and quality checks in the proposed layer would facilitate further analysis of this data.
+Fig. 4. IoT inclusive quality assurance framework.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+119
+Telfor Journal, Vol. 11, No. 2, 2019.
+Data accuracy and consistency, data timeliness and data usability are very important quality attributes and can affect the performance of an analytics application. Ascertaining these attributes for IoT data requires entirely different approaches and methods. Fig. 5 elaborates the difference between the data quality assurance methods with respect to IoT big data and non IoT big data applications for these above-mentioned quality attributes.
+Thus, IoT data needs to undergo various transformations before its assimilation into a big data analytics framework. The data quality validation layer proposed in this study aims to encompass the features of IoT data quality listed in Fig. 5. Based on various processes and methods as mentioned transformations on raw IoT data are performed wherever necessary. Seamless implementation of measures discussed with respect to every challenge mentioned in the preceding section would assure the quality of IoT data which is the primary ingredient of any new-age analytics model. An IoT data validation workflow can be designed based on this proposed validation layer to ensure that the data is ready for integration with other data in the big data ecosystem. This validated IoT data can then be integrated with HDFS, HIVE or any other big data framework for further analysis and interpretation.
+
+Fig. 5. Data quality assurance: IoT Big Data vs Traditional Big data.
+VI. CONCLUSION
+Data testing is a critically important phase in the development of big data application. IoT is a massive game changer in the modern world where sensors are the heart of IoT and big data. IoT and big data help to connect to devices to generate data to transmit, compile, and run analyses and predict and forecast new future. This paper is an effort to highlight various dimensions of the IoT data quality. The paper also highlights the requirement of a dedicated IoT data pre-processing and validation cycle for IoT data before its integration with other data in Big data IoT paradigm. Authors emphasize a smooth and continuous amalgamation of these additional processes for futuristic IoT big data applications.
+REFERENCES
+[1] J. Gao, C. Xie and C. Tao, “Big Data Validation and Quality Assurance -- Issuses, Challenges, and Needs,” 2016 IEEE Symposium on Service-Oriented System Engineering (SOSE), Oxford, 2016, pp. 433-441.
+[2] N. Elgendy and A. Elragal, “Big Data Analytics: A literature review paper,” P. Perner (Ed.): ICDM 2014, LNAI 8557, pp. 214-227, 2014.
+[3] J. Gao, X. Bai, W. Tsai and T. Uehara, "Testing as a Service (TaaS) on Clouds," 2013 IEEE Seventh International Symposium on Service-Oriented System Engineering, Redwood City, 2013, pp. 212- 223.
+[4] E. Ahmed et al., “The role of big data analytics in Internet of Things,” Computer Networks, vol. 129, Part 2, pp. 459-471, 2017.
+[5] M. Gudipati, S. Rao, N. D. Mohan and N. K. Gajja, “Big data testing approach to overcome quality challenges,” Infosys publication, vol. 11, pp. 65-72, 2013.
+[6] M. Mohammadi, A. Al-Fuqaha, S. Sorour and M. Guizani, “Deep Learning for IoT Big Data and Streaming Analytics: A Survey,” IEEE Communications Surveys & Tutorials, vol. 20, no. 4, pp. 2923- 2960, Fourthquarter 2018.
+[7] https://iot-analytics.com/state-of-the-iot-update-q1-q2-2018- number-of-iot-devices-now-7b.
+[8] P. Verdugo, J. Salvachiua and G. Huecas, “An agile container-based approach to TaaS,” 2017 56th FITCE Congress, Madrid, 2017, pp. 10-15.
+[9] M. Hassanalieragh et al., “Health Monitoring and Management Using Internet-of-Things (IoT) Sensing with Cloud-Based Processing: Opportunities and Challenges,” 2015 IEEE International Conference on Services Computing, New York, NY, 2015, pp. 285- 292.
+[10] H. Kim et al., “IoT-TaaS: Towards a Prospective IoT Testing Framework,” in IEEE Access, vol. 6, pp. 15480-15493, 2018.
+[11] R. Kumar, et al., “Monitoring system using android App”, ARPN Journal of engineering and applied sciences, vol 12, no 19, pp. 5647- 5652, October 2017.
+[12] C. Bekara, “Security Issues and Challenges for the IoT-based Smart Grid,” Procedia Computer Science, vol. 34, pp. 532-537, 2014.
+[13] P. Bhardwaj et al., “A review paper on smart home automation”, International Journal of Scientific Research and Management Studies (IJSRMS), vol. 3, no. 6 pp. 246-250, January 2017.
+[14] Z. Khan, Z. Pervez, A. G. Abbasi, “Towards a secure service provisioning framework in a Smart city environment,” Future Generation Computer Systems, vol. 77, pp. 112-135, 2017.
+[15] M. Sripan, X. X. Lin, P. Petchlorlean and M. Ketcham, “Research and thinking of smart technology,” International conference on the system and electronic engineering, December 18-19, 2012.
+[16] R. Saia, “Internet of Entities (IoE): a Blockchain-based Distributed Paradigm to Security,” arXiv:1808.08809v1.
+[17] A. Čolaković and M. Hadžialić, “Internet of Things (IoT): A review of enabling technologies, challenges, and open research issues,” Computer Networks, vol. 144, pp. 17-39, 2018.
+[18] C. Mankar et al., “Internet of Things (IoT) an Evolution,” International Journal of Computer Science and Mobile Computing, vol. 5, no. 3, pp. 772-775, March 2016.
+[19] G. Sabarmathi, R. Chinnaiyan, and V. Ilango, “Big Data Analytics Research Opportunities and ChallengesA Review,” International Journal of Advanced Research in Computer Science and Software Engineering, vol. 6, no. 10, pp. 227-231, October 2016.
+[20] W. Yu et al., “A Survey on the Edge Computing for the Internet of Things,” in IEEE Access, vol. 6, pp. 6900-6919, 2018.
+[21] C. Maple, “Security and privacy in the internet of things,” Journal of Cyber Policy, vol. 2, no. 2, pp. 155-184, 2017.
+[22] S. Pacha, S. R. Murugan and R. Sethukarasi, “Semantic annotation of summarized sensor data stream for effective query processing,” J Supercomput, 2017.
+[23] P. Murdock ed., “Semantic Interoperability for the web of Things,” DOI: 10.13140/RG2.2.25758.13122, August 2016.
+[24] M. Harlamova, M. Kirikova and K. Sandkuhl. “A Survey on Challenges of Semantics Application in the Internet of Things Domain.” Applied Computer Systems, vol. 21, pp. 13-21, 2017.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt b/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt
new file mode 100644
index 0000000..38387bb
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt
@@ -0,0 +1,114 @@
+Int J Syst Assur Eng Manag
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Software (OSS). The Open Source Software is now a movement and has seen an exponential growth in spread and depth; riding the wave of phenomenal growth in net- works and internet related technologies. The origin of OSS can be traced back to 1970s, when Richard Matthew Stallman, often known by his initials, RMS propounded the concept of OSS. RMS believed that both software and
+& Ranjan Kumar
+ranjan301@gmail.com
+Subhash Kumar subhashkumar@andc.du.ac.in
+Sanjay K. Tiwari tiwari.dr.sanjay@gmail.com
+https://doi.org/10.1007/s13198-019-00777-x
+ORIGINAL ARTICLE
+A study of software reliability on big data open source software
+Ranjan Kumar Department of Computer Science, Aryabhatta College
+(University of Delhi), Benito Juarez Marg,
+software development, intrinsically by their nature belongs to the body of knowledge for the humankind and thus must be shared freely. RMS introduced the free version of the
+New Delhi 110021, India
+ • Subhash Kumar Department of Physics, Acharya Narendra Dev College
+(University of Delhi), Govindpuri, Kalkaji,
+widely used Unix operating system under GNU (Stallman 1998). Freedom the core concept of OSS, according to RMS was seen as a fundamental component of free speech
+New Delhi 110019, India
+ • Sanjay K. Tiwari Post Graduate Department of Mathematics, Magadh
+University, Bodh Gaya, Gaya, Bihar 824234, India
+and strongly advocated sharing of the software's code and
+123
+
+
+Received: 9 May 2018/Revised: 10 December 2018
+ The Society for Reliability Engineering, Quality and Operations Management (SREQOM), India and The Division of Operation and Maintenance, Lulea University of Technology, Sweden 2019
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Int J Syst Assur Eng Manag
+Abstract With the increasing use of Open Source Soft- ware (OSS) in high speed networking, parallel processing and distributed computing, OSS has emerged as main- stream in the last decade and is now being broadly accepted even by the traditional proprietary software development companies. The major advantages of OSS over traditional software development are less development cost, avail- ability of source code, quality and security. Software reli- ability an important attribute of software quality, is defined as the probability that a software will operate free of failures or breakdown for a specified time under speci- fied conditions (IEEE Std. 1633-2016). Investigation of Software reliability with the help of software reliability models (SRM) undertakes the estimation and prediction of the failure phenomenon of a software. In this paper we have investigated whether Non-homogeneous Poisson process (NHPP) based software reliability models fit in the big data open source software fault/bug data. We have extracted real and latest bug/fault data of Hadoop and
+Spark open source big data applications, from bug track- ing/management tool Jira. For this purpose, we have also compared these models on different goodness-of-fit and prediction criteria based on collected failure data to ascertain whether a best fitted model can also be a best predictor. It is found that the best model fitting the failure data is not a best predictor model.
+Keywords Bug Goodness of fit NHPP OSS 1 Introduction
+The last decade has witnessed rapid and profound devel- opment in computer networking and internet related tech- nologies. This has heralded a new dimension to the entire gamut of software development. It has given a decisive impetus to the development of an entirely new ecosystem wherein the development process of software is essentially concurrent and distributed in nature the Open Source
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Int J Syst Assur Eng Manag
+the associated idea. The salient attributes of open source software involves possession of certain sacred and free rights viz. right to use, right to reproduce, right to modify and right to distribute the software. It has to be realized that free in this praxis is not synonymous in the economic sense, rather it refers to free as in freedom to do certain acts in the software development process and doing away with restrictions which generally accompany the proprietary software. This model of software development results in a more robust and reliable software; which is not only reliable but also more efficient and productive. This model promotes transparency in projects and thereby minimizes risk in the development process of the software. The philosophy and practice of OSS was firmly established by Eric Raymond in his seminal paper The Cathedral and the Bazaar (Raymond 1999). In this essay and later a book Eric Raymond likened the proprietary software to the Cathedral model whereas the OSS development to the Bazaar model and argued that these two models are based on antagonistic assumptions about the nature of the debugging task in software. The process of development of OSS imparts myriads of advantage to its products when compared to the commercial proprietary software. The OSS are found to have fewer bugs, have better reliability, are free from vendor's lock-in periods and thus are free from vendor dependence. The OSS possesses better and quick support as they belong to the community rather than to a firm. These products also have educational value. A critical analysis of the claims of the suitability of OSS due to these factors has been taken up (Ven et al. 1998). It has been found out that indeed certain factors like economical products, availability of source code, support by the community, independence from vendor lock-in and maturity of software do put OSS to advantage vis-à-vis commercial software.
+Having said that, the quality of software remains a prime concern. It is important because it brings out the extent up to which the software meets the user s requirement. Therefore, qualitative and quantitative assessment of the software has attracted a lot of attention. Studies which discern the quality of the software include empirical studies and mathematical modeling. Out of the various tools available for quantitative assessment of software, the exponential model also known as reliability growth model and Software Reliability Model (SRM) are ubiqui- tously utilized. While the exponential model models the appearance of defects at the backend of the development for projecting failure pattern in the field, the SRM fixes a definite probability for the software causing a system failure over some specified operating period. A large body of empirical data supports both of these models.
+Software Reliability Model (SRM) has emerged as a key indicator as well as predictor for determining the quality of
+software as soon as the software is launched in the market. By definition, SRM is a mathematical expression which provides the generic form for appearance of bug in the software as a function of bug detection, bug correction and the operational environment (Std 1633). SRM is utilized to assess as well as predict reliability of a product. For assessment of reliability SRM seeks to fitthe data extracted for the failure of software using various statistical tech- niques like linear regression or non-linear regression. The choice of technique obviously depends upon the behavior of extracted data. For the purpose of predicting the relia- bility of the software, the expected number of bugs is estimated through fitted SRM (Lyu 1996; Yamada 2014).
+The issue of reliability in case of OSS has also received some attention. Several hypotheses have been proposed to investigate the relationship, if any, between reliability and openness (Joode and Bruijne 2006). A study on OSS pro- ject s bug data has however, concluded that the traditional software reliability growth model cannot be applied for the assessment of the reliability growth of OSS because the software development paradigm of an OSS is intrinsically different from proprietary software and further goes on to suggest an alternative approach for assessment of OSS products (Zou and Davis 2008). OSS has been subjected to quality assessment quantitatively using alternative approaches (Tamura and Yamada 2009, 2010; Zhou 2005). Studies on bug tracking data of few popular OSS reveals that the OSS projects as well as closed source projects (CSS) show similar reliability growth pattern (Singh et al. 2010a, b). This has been further confirmed by the Non- homogeneous Poisson process (NHPP) based reliability models wherein similar reliability growth curve have been reported for OSS as well as CSS (Singh et al. 2010c, d). This raises the relevant question that if from a reliability point of view, the OSS behaves in the same way as CSS, then which model is most appropriate for its assessment? The bug detection rate of two OSS projects examined with in house developed software using two SRMs found that the two OSS projects exhibited different profiles of bug arrival behavior (Syed-Mohamad 2008). By analyzing six OSS projects bug data Zhou (2005) found that OSS and CSS projects exhibit a similar pattern of reliability growth. They used general Weibull model to fit bug occurrence of OSS projects. The Weibull distribution has also been also suggested by Rossi (2010) as the best model for OSS by analyzing the bug occurrence behavior of three OSS pro- jects applying SRM. 
On the contrary, Rahmani (2010) discovered a fundamentally different result by using 3 models and dataset of 5 OSS projects bug data. They found that the Weibull was the worst model. By modeling of the bug reports using nonparametric techniques for the six OSS projects bug data Zou (2008) observed that exponential smoothing methods and Generalized Additive models are
+better suited for reliability of OSS products. For reliability classification of OSS products, SRMs can be used suitably (Li et al. 2011).
+It is evident that a plethora of models for software reliability is available in the market as well as in the lit- erature. Many of these models are based on Non Homo- geneous Poisson Process (NHPP). In these models, failure process is assumed to follow a non-homogeneous Poisson process. These SRMs generally have an intensity function or the rate of bugs/failures in the software given by a power law polynomial and display a great degree of flexibility in application. For the commercially available traditional software, these NHPP models have been found to be suc- cessful and have been widely utilised for software relia- bility studies. However, it remains to be discerned whether these models for software reliability can also be used gainfully for the same purpose in case of OSS. The aim of the present study is to investigate the suitability of NHPP based SRMs on OSS in general and Big data OSS Spark and Hadoop in particular. The rest of the paper is organised as follows. In Sect. 2, some chosen SRMs which are widely used and are based on NHPP are introduced along with their characteristic functions. These models undergo evaluation or validation in Sect. 3 on two data sets on bugs/failures of two popular Big data OSS Hadoop and Spark. In this section, analysis of the data sets includes parameter estimation for the respective models. This is followed by comparison of models using Goodness-of fit criterion. The analysis also probes the assessment and predicting abilities of these SRMs for the representative datasets of the bugs reported in the chosen big data OSS. Here the criterion of goodness of fit implies how well a model predicts the dataset which has already been utilized to estimate its parameters, while how well a model predicts new data points is said to be its predictive capability i.e., predicting unseen data in future. Section 4, presents the results and interpretation of the analysis carried out in the present investigation.
+2 NHPP models
+NHPP models considers the number of faults per unit time as an independent Poisson random variable which evolve by a non homogeneous Poisson process (Yamada 2017). NHPP models have been very successful and are amongst the widely applied models for software reliability studies. The reasons behind popularity of NHPP are follows:
+(i) These are categorized by a mean value function, m(t), which help in calculating expected number of bugs up to time t very easily.
+(ii) Parameters of the model can also be computed very easily.
+(iii) NHPP models are closed under time transforma- tion and superposition (Lai and Garg 2012).
+Here we consider five well known conventional NHPP models to measure and evaluate them on two well estab- lished big data open source projects viz. Hadoop and Spark. Analysis is carried out to findout (i) whether they fit on them and (ii) whether a best goodness-of-fit model can also be a best predictor model. The five models chosen for present study are briefly described below:
+2.1 Goel Okumoto (GO) model (Goel and Okumoto 1979)
+It is an exponential NHPP model developed by Goel and Okumoto in 1979. It was proposed on the assumption that whenever a bug is detected, it is corrected in no time and all detected bugs are mutually independent to each other.
+2.2 Kapur and Garg (KG) model (Kapur and Garg 1992; Kapur et al. 2011)
+The model, proposed by Kapur and Garg in 1992 assumes that during the debugging process some additional errors/faults may also be corrected, while removing the bonafide failures. While the bonafide failures are termed as independent faults, the additionally removed faults are deemed to be dependent faults.
+2.3 Yamda delayed S-shaped (YDS) model (Yamada et al. 1983)
+Yamda proposed this model in the year 1984 with a modification of NHPP model. It is also considered as generalized exponential model with the assumption that the behavior of bug arrival pattern first increases and then decreases to obtain S-shaped curve. A software bug detection process is described by failure detection process and bug isolation process.
+2.4 Inflection S-shaped model (ISM) (Ohba and Osaki 1984)
+The model was developed by Ohba in 1984 and it is based on the dependency of faults with the assumptions: a) bug detection rate of each bug is constant, b) the isolated fault can be fully removed and some faults cannot be detected before removing some other faults.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+123
+Int J Syst Assur Eng Manag
+2.5 Pham—Nordmann—Zhang (PNZ) model (Pham et al. 1999)
+This model was proposed by Pham in the year 1999 which considered imperfect debugging situations with the assumption that during debugging new bug can appear with the constant bug detection rate.
+The mean value function, m(t), and the intensity function λ(t) are the two characteristic functions which constitute the building blocks of all the above models based on NHPP. While m(t) is the mean value function of the expected number of faults/bugs which have been detected/removed in the time interval [0, t], the failure intensity function λ(t) = dm(t)/dt measures the instantaneous rate of change of the expected number of failures, i.e., m(t), at time t, given that the system has not failed up to time t. Table 1 enumerates the characteristic functions of the NHPP models chosen in the present study. Here n is the total number of expected faults, f is the bug detection rate, c is the bug inclusion rate and q represents the dependent bug detection rate.
+3 Model evaluation/validation
+Once mathematical models have been selected, they are evaluated for its ability to fit the historical failure data of the software i.e., Goodness of fit.Additionally, they need to be further evaluated for their ability to predict occurrences of failures of the software in future i.e., predictive capa- bility. For this purpose, it involves estimation of the unknown parameters of the chosen models. As the NHPP- based software reliability are described by non-linear functions, Non-linear least square (NLLS) and Maximum likelihood estimate (MLE) techniques are used to estimate the unknown parameters for these models on actual data- sets for software failures (Kapur et al. 1999). After esti- mation of the parameters are validated on the given dataset to find out their fitting and predictive capabilities. We have
+carried out data analysis on two real datasets of under consideration models using R language which is not only an open source software but also one of the most efficient and popular data analysis tool.
+3.1 Data set
+Among several open source software related to Big Data, we have selected here the two most widely used and established tools for analyzing big data – Hadoop and Spark. Among the repositories of the issues for Hadoop and Spark, the present study focused on only those issues that were declared “bug”. Other types of issues like “improvement”,
+“wish”, “new feature”, “task” or “patch” were excluded
+so that we could deal exclusively with proper failures. Among the data classified as bugs, we have further filtered it and selected the bugs having status as “closed”. This means those bugs which have been resolved and verified by the reporter have been only considered in the analysis. The dataset was also further processed and cleaned with resolution defined something like “cannot reproduce”, “duplicate”, “won't fix” or others. Table 2 illustrates our choice of data after processing.
+Data have been downloaded from the issue tracking and management tool Jira's website (Apache Website 2018). Although Hadoop has four components, we have only considered and extracted the Hadoop Common component's bug data. A total of 406 failures were observed in dataset D1 and 375 failures in D2. Detailed month-wise bug detection patterns for Hadoop and Spark are shown in Fig. 1.
+3.2 Parameter estimation
+For calculation of the estimated bugs it is important to first compute the values of unknown parameters in the mean value function. Parameter estimation is generally done by using two estimation techniques; Non Linear Least Square (NLLS) and Maximum Likelihood Estimate (MLE) (Kapur et al. 2011). Since data is irregular in nature, we have used
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+123
+Int J Syst Assur Eng Manag
+Table 1 Summary of NHPP models with mean value function
+Model | Model name | Mean value function m(t)
+GO | Goel–Okumoto (Goel and Okumoto 1979) | $m(t) = n\left(1 - e^{-ft}\right)$
+KG | Kapur–Garg model (Kapur and Garg 1992) | $m(t) = \dfrac{a\left(1 - e^{-(f+q)t}\right)}{1 + \frac{q}{f}\, e^{-(f+q)t}}$
+YDS | Yamada Delayed S-shaped (Yamada et al. 1983) | $m(t) = n\left(1 - (1 + ft)\, e^{-ft}\right)$
+ISM | Inflection S-shaped (Ohba and Osaki 1984) | $m(t) = \dfrac{n\left(1 - e^{-ft}\right)}{1 + c\, e^{-ft}}$
+PNZ | Pham PNZ model (Pham et al. 1999) | $m(t) = \dfrac{n\left(1 - e^{-ft}\right)\left(1 - \frac{c}{f}\right) + cnt}{1 + d\, e^{-ft}}$
+
+Table 2 Collection of bug data for two OSS
+OSS Project
Dataset
Issue type
Status
Resolution
Period
Hadoop Common Spark
D1 D2
Bug Bug
Closed Closed
Fixed Fixed
April 2014 to Dec. 2017 (45 months) Sept. 2012 to Dec. 2017 (64 months)
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+123
+Int J Syst Assur Eng Manag
+
+Fig. 1 Bug arrival pattern of Hadoop and Spark
+the nonlinear estimation function in R to calculate the values of the estimated parameters; it uses the maximum likelihood method. The computed estimates of the parameters for datasets D1 and D2 are shown in Tables 3 and 4.
+3.3 Comparison criteria of models
+For the purpose of comparison among the various NHPP-based SRMs considered here vis-à-vis their suitability in fitting the bug data of the two OSS under investigation, the following criteria have been utilised.
+3.3.1 Goodness-of-fit criterion
+Goodness-of-fit denotes how well a mathematical model fits a given set of data.
+3.3.1.1 Akaike information criterion (AIC) AIC is used to select the best model among all those models whose unknown parameters are estimated by maximum-likelihood method.
+Table 3 Estimated parameters for dataset D1
+
+Model
n
f
c
d
q
GO
417.458
0.1056
KG
401.014
0.064
0.147
YDS
400.238
0.2447
ISM
401.014
0.211
2.295
PNZ
355.58
0.307
0.004
4.806
Table 4 Estimated parameters for dataset D2
+
+Model
n
f
c
d
q
GO
287.47
0.058
KG
363.065
0.00012
0.266
YDS
620.95
0.037
ISM
363.065
0.266
2373.89
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+123
diff --git a/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt b/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt
new file mode 100644
index 0000000..d94658c
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt
@@ -0,0 +1,180 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+SPECIAL SECTION ON INNOVATION AND APPLICATION OF INTELLIGENT PROCESSING OF DATA, INFORMATION AND KNOWLEDGE AS RESOURCES IN EDGE COMPUTING
+Received August 9, 2019, accepted August 19, 2019, date of publication August 23, 2019, date of current version September 9, 2019. Digital Object Identifier 10.1109/ACCESS.2019.2937107
+Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+CHUANQI TAO 1,2,3 , JERRY GAO4, AND TIEXIN WANG1,2
+1College of Computer Science and Technology, Nanjing University of Aeronautics and Astronautics, Nanjing 210016, China
+2Ministry Key Laboratory for Safety-Critical Software Development and Verication, Nanjing University of Aeronautics and Astronautics, Nanjing 210016, China 3State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210093, China
+4Department of Computer Engineering, San José State University, San Jose, CA 95192-01809, USA
+Corresponding author: Chuanqi Tao (taochuanqi@nuaa.edu.cn)
+This work was supported by the National Key Research and Development Program of China under Grant 2018YFB1003900, in part by the National Natural Science Foundation of China under Grant 61402229 and Grant 61602267, in part by the Collaborative Innovation Center of Novel Software Technology and Industrialization, in part by the Fundamental Research Funds for the Central Universities under Grant NS2019058, and in part by the Open Fund of the State Key Laboratory for Novel Software Technology under Grant KFKT2018B19.
+ABSTRACT With the fast growth of artificial intelligence and big data computing technologies, more and more software service systems have been developed using diverse machine learning models and technologies to make business and intelligent decisions based on their multimedia input to achieve intelligent features, such as image recognition, recommendation, decision making, prediction, etc. Nevertheless, there are increasing quality problems resulting in erroneous testing costs in enterprises and businesses. Existing work seldom discusses how to perform testing and quality validation for AI software. This paper focuses on quality validation for AI software function features. The paper provides our understanding of AI software testing for new features and requirements. In addition, current AI software testing categories are presented and different testing approaches are discussed. Moreover, test quality assessment and criteria analysis are illustrated. Furthermore, a practical study on quality validation for an image recognition system is performed through a metamorphic testing method. Study results show the feasibility and effectiveness of the approach.
+INDEX TERMS
+AI software quality validation, AI testing, testing AI software.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+I. INTRODUCTION
+With the fast advance of big data analytics and AI technologies, numerous AI-based software and applications have been widely accepted and used in people's daily life. AI software and applications are developed based on state-of-the-art machine learning models and techniques through large-scale data training to implement diverse artificial intelligent features and capabilities. Current AI-based software and applications are classified such as natural language processing systems, object recognition systems, recommendation systems, unmanned vehicles and so on. Therefore, how to perform quality validation for AI software becomes a critical concern and research topic from both academic and industrial focuses. According to the report [1], the automation testing market size is expected to grow from USD 8.52 Billion in 2018 to USD 19.27 Billion by 2023, at a Compound Annual Growth Rate (CAGR) of 17.7% dur-
+The associate editor coordinating the review of this article and approving it for publication was Honghao Gao.
+ing the forecast period (2018–2023). Based on recent testing experiences from industry on AI applications such as intelligent mobile apps, testing AI software has new problems, challenges, and needs due to their special features below.
+- Scientic-based development instead of engineering-
+based development - Most AI software and applications are developed using scientic approaches based on AI models and training data by data scientists and big data engineers without well-dened AI software engineering process and development methods with clear quality validation require- ments and criteria.
+- Limited data training and validation - AI software is
+built based on machine learning models and techniques, and trained and validated with limited input data sets under ad- hoc contexts.
+- Data-driven learning features - These features provide
+static and/or dynamic learning capabilities that affect the under-test software results and actions.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+120164 This work is licensed under a Creative Commons Attribution 4.0 License. For more information, see http://creativecommons.org/licenses/by/4.0/ VOLUME 7, 2019
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+- Uncertainty in system outputs, responses, and decision
+makings - Since existing AI-based models are dependent on statistics algorithms, this brings the uncertainty in the outcomes of AI software.
+These unique AI software features above cause new difficulties and challenges in testing and quality validation. Therefore, AI quality validation and assurance becomes a critical concern and a hot research subject. Although there have been many published papers addressing data quality and quality assurance in the past [2]–[4], few researches focus on validation for AI software from a function or feature view. There is an emergent need in current research for quality validation issues and quality assurance solutions for AI software and applications. Testing AI software can be considered as diverse testing activities with the intent of finding AI-based software bugs (errors or other defects), verifying that the AI-based software products are fit for use, assuring AI functional features' adequate quality and AI software's QoS (quality of system service) parameters [41], [43]. Well-defined quality validation models, methods, techniques, and tools must be developed and applied for AI-based software to facilitate the test activities to achieve well-defined test requirements and meet pre-selected adequate testing criteria and quality assurance standards. Typical issues of quality assurance and validation for AI software and applications are listed below.
+- How to perform quality assurance for big data which
+couldbeutilizedastrainingdataortestingdataforintelligent algorithms?
+- How to make quality validation for application service,
+e.g. what is the precision of the recommendation service?
+- How to validate the quality of diverse intelligent algo-
+rithmsandmodels,suchasdataminingandmachinelearning methods.
+This paper is written to provide our perspective views on AI software (specic to feature or function) testing for quality validation. The paper is organized as follows. Section II discusses the tutorial concepts about AI software testing, including test focuses, features, and requirements. Section III reviews AI-based machine testing, AI software function testing, as well as the existing testing methods potentially-used for AI software validation. Section IV dis- cusses AI software testing quality parameters and evaluation as well as test coverage analysis. Section V presents case studies on an image recognition system using the proposed quality validation approach. The conclusion remarks are in Section VI.
+II. UNDERSTANDING AI SOFTWARE TESTING
+Why do we need AI software testing? The fast-growing AI software and the popularity of big data-based applications bring new needs and motivations. Numerous current and future software will be built with AI-based features and functions. Existing techniques and tools are not adequate to test AI-based features and functions. There is a lack of well-defined and experience-approved quality validation
+
+FIGURE 1. The scope of AI software testing.
+models and assessment criteria. In addition, there is a lack of AI-based testing methods and solutions for AI software. Thus, the meaning of testing AI software is illustrated in a denition below.
+``Testing AI software refers to diverse testing activities for AI-based software/systems. Well-dened quality valida- tion models, methods, techniques, and tools must be devel- oped and applied for AI-based software to facilitate the test activities to achieve well-dened test requirements and meet pre-selected adequate testing criteria and quality assurance standards.''
+Therefore, testing AI features of the software includes different testing activities to nd software errors, verify the performance of software, and assuring quality validation methods need to be developed. The testing goal is to achieve well-dened test requirements, meet pre-dened testing cri- teria, and standards of quality assurance of the under-test AI software.
+A. TEST SCOPE AND MAJOR FOCUSES
+Since AI software is built with diverse machine learning models and data-driven technologies, the scope of AI software testing should cover current typically-used intelligent features, such as prediction, recognition, and recommendation. Fig. 1 shows the primary scope of AI software testing. Objects (human, animal) related testing such as object identification, recognition, and behavior detection are an important part of AI software testing. Various intelligent applications such as business decision, recommendation and selection [35], [36], [45], intelligent commands and actions, analytics and prediction capability [37], [38], [40], [46], as well as question and answer capability are current key AI testing topics. In addition, with the advance of unmanned vehicles and their potential huge markets, how to perform control validation and healthcare check will be a big challenge for AI testing and quality validation. Moreover, AI software usually involves context issues, such as scenario, location [35], time, and stakeholders, thereby causing new testing issues in context identification and classification. The major focuses of AI software testing are summarized as follows.
+(a) Testing AI functional features to assure their adequate quality in accuracy, consistency, relevancy, timeliness, cor- rectness, and so on using data-driven and AI approaches.
+(b)Testing AI software's quality of system service param- eters based on well-dened quality standards and assessment criteria. These include system performance, reliability, scal- ability, availability, robustness, and security, and etc.
+(c) Apply data-driven AI techniques to facilitate AI testing
+processes and test automation.
+B. NEW TESTING FEATURES AND REQUIREMENT ANALYSIS FOR AI SOFTWARE
+As discussed above, AI software and applications have numerous unique testing features such as uncertainty and limited training/test dataset. These unique features bring more interesting quality validation and QoS requirements, challenges, and needs. Based on the recent feedback from engineers at Silicon Valley, how to assure the quality of AI software becomes a critical concern and research subject cur- rently. The primary testing features are presented as follows.
+Multiple dimension-based rich media input data with multi-input models. This refers to new testing solutions to deal with multi-dimensional large-scale input data sets (such as numerous image graphs and videos) of AI software. For example, the well-known AI application Seeit1 supports text, graph, voice, and audio with diverse input domains both ofine and online.
+Test data set selection from big data pools. This refers to test data selection to address the special testing features of AI software. In traditional software, test data is used for finding software bugs. Nevertheless, in AI software, test data is not just used for functional or program bugs. Bugs or defects existing in training and learning models in AI software also need to be discovered using specific test data. A typical face recognition application 'how old do I look' from Microsoft2 can be tested with thousands of pictures to indicate its correctness and accuracy. However, how to select effective test data to discover its identification problems, e.g., the accuracy of 'how old do I look' is affected by lighting condition or background objects. Furthermore, bugs from models or learning algorithms can be detected with more test data with specific goals.
+Knowledge-based AI software features and behaviors This refers to apply the domain-specic knowledge to assist in testing correct and precise AI software features and behav- iors.
+Uncertainty of AI software features and behaviors. This refers to how to dene and modeling testing objects in a certain way and obtain testable functions through different test strategies, such as metamorphic testing, mutation testing, and fuzzy testing.
+Learning-based AI software features and behaviors. This refers to finding new testing approaches to address the learning
+1https://itunes.apple.com/cn/app/seeit/id721911549?lDen&mtD8 2https://www.how-old.net/
+
+FIGURE 2. A sample object model-based AI software.
+features of AI software. For instance, the learning capa- bility of AI software is needed to be tested in an evolved environment.
+Real-time context-based diverse inputs affecting system outputs, actions, and behaviors. This refers to modeling complex context factors in a real-time instance, and analyze the relationship among diverse contexts, inputs, outputs, and actions.
+After identifying the primary AI features, AI function features are analyzed for testing. For each identied feature, AI testing requirements are needed to analyze for future testing. For example, before testing an object of AI software, in order to facilitate function or scenario testing, diverse features are required to classify with a well-dened category. Test models are necessary to represent the diverse features under testing. In general, models can be constructed from different perspectives for AI software, such as a knowledge test model, feature test model, object test model, and data test model. As shown in Fig. 2, features of object relation, object identication, object behavior, object classication, and object context are selected for function testing with diverse sub-features.
+In general, AI software needs to be tested at both function and system levels. Test planning, test modeling, test design, and test execution are the indispensable parts of the overall testing process for both AI software and traditional software. Since AI software has special features such as non-oracles, timeliness, and learning capability, here function test quality evaluationisaddedparticularlyasthenalstepofAIsoftware testing process. In this step, different quality parameters are measuredusingthepre-denedqualitymetricsbasedontest- ing result analysis. If the evaluation results are not accepted by stakeholders, the testing step goes to test modeling again for a new testing iteration.
+III. AI SOFTWARE QUALITY VALIDATION CATEGORY AND APPROACHES
+This section rstly illustrates a category of AI software test- ing, including Turing testing, testing AI software, AI-based software testing and AI-based machine testing. Then several existing and potential approaches to AI software testing will
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+120167
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+be presented and discussed. Moreover, test quality evaluation and test adequacy analysis are illustrated.
+A. TURING TESTING
+Turing test was introduced by Turing as the imitation game in 1950 [5], aiming to test a machine's ability to exhibit intelligent behavior equivalent to, or indistinguishable from, that of a human. Turing proposed that a tester would ask the testee freely through some devices (such as a keyboard) in the case where the tester is separated from the testee (one person and one machine). After multiple tests, if more than 30% of the testers are unable to determine whether the testee is a human or a machine, then the machine passes the test and is considered to have human intelligence. The Turing test has been considered as the ``beginning'' of artificial intelligence (AI) [6], and it has also become an important concept related to AI system testing. Although the Turing test was designed to advance the development of artificial intelligence, it also has several shortcomings [7].
+B. AI SOFTWARE TESTING
+In this section, the main focus is on validating AI software functions, external behaviors, and external visibility of QoS using black-box testing techniques. To test software functions and features, engineers could adopt conventional black-box approaches to validate software quality. Typical examples include scenario analysis, decision table testing, equivalence partitioning, boundary value analysis, cause-effect graph, and so on.
+However, AI software testing differs from traditional software testing, since AI applications are characterized by uncertainty and probabilities, dependence on big data, random input/output, difficulty in predicting all application scenarios, and constant self-learning from past behavior. In recent years, many studies have worked on researching how to test AI software or systems [7]–[11].
+Broggi et.al proposed the Public Road Urban Driverless (PROUD) test conducted in Parma from the uni- versity campus to the town center through different scenar- ios such as urban, rural, and highway roads [7]. Similarly, Li et al. [8] indicated the difculties of intelligence tests from four aspects and presented an example of how to design intelligence tests for intelligent vehicles. The authors gave the denition and generation of intelligence test tasks for vehicles to combine the benets of scenario-based test- ing and functionality-based testing approaches based on a semantic relation diagram for driving intelligence proposed in [9]. In addition, the authors applied the parallel learning method to the vehicle intelligent test and proposed a par- allel system framework that combined the real-world and simulation-world for testing [10], [11].
+As discussed above, the process of testing AI functions includes test planning, test modeling, test case generation, test execution, and test quality evaluation. The decision table testing design technique determines the different combinations of inputs with their associated outputs and implements the
+TABLE 1. A sample traditional scenario analysis on siri.
+
+business requirements or rules of the system. It is also a represented type of cause-and-effect testing or logical test- ing. Black-box testing is used to test the end-user require- ments [12], [13]. It attempts to uncover the errors in the followingcategories:missingorincorrectfunctions,interface errors, behavior or performance errors, and initialization or termination errors.
+Let us take Siri3 from Apple for instance. The functions of Siri based on voice command input are listed as below: receive voice commands, convert voice commands into text commands (display entered commands), find the text response and actions that match the recognized commands, text response, action response. To verify the AI functions of the software, the traditional scenario analysis method is applied to analyze the scenarios of applications and test whether the main functions are implemented correctly from the perspective of the scene. Table 1 shows a description of five scenarios in testing Siri.
+Based on the analyzed results and testing experiences, we conclude that the test cases designed by scenario analysis are practical and effective to validate common features and conditions. However, there are some defects to generate test cases using scenario analysis as follows.
+a. As a typical intelligent software application with AI
+features, Siri has rich context information. The different test contexts affect the results of testing Siri, such as the back- ground noise, the tester's gender, age, and accent.
+However, the traditional scenario analysis does not consider these external conditions for testing. Hence, the designed use cases are incomplete, and the execution results of some test cases failed.
+b. Advanced AI software or systems have the ability to
+learn from data and experiences. Furthermore, some AI sys- tems even learn from environmental interactions and learn
+3https://www.apple.com/siri/
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+120169
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+dynamically during interaction with users. Thus, the more time you spend on using Siri, the better it will understand you. Siri achieved this by learning about your accent and some other characteristics of your voice. Therefore, if the same tester repeatedly tests Siri with the same voice command, its overall recognition of dialects and accents will continue to improve, and test results will also be affected. Unfortunately, traditional scenario analysis does not take this into account.
+In order to test the voice-command-based AI functions more precisely, we should take different voice testing envi- ronments into account with context factors and modeling multi-dimensional testing space for AI features. Currently, we are working on this in another paper.
+C. AI-BASED SOFTWARE TESTING
+AI-based software testing refers to the leverage and appli- cations of AI methods and solutions to automatically opti- mize a software testing process in test strategy selection, test generation, test selection and execution, bug detection and analysis, and quality prediction [39], [42], [47]. It includes different testing activities in AI-based software testing. Due to the complexity of AI software and applications, traditional methods and test tools cannot meet the demands of testing these AI systems. Given this, a more effective method to test AI systems is desirable.
+To deal with this problem, Souri et al. [14] used an AI-based testing technique named as Multi-Objective Genetic Algorithm (MOGA) to reduce the number of test cases for testing web applications yet achieve maximum coverage with reduced cost, time and space. Considering manual testing is a tedious and time-consuming task, and it may also result in insufficient testing being performed and critical defects going unidentified, Straub and Huber [15] proposed an artificial intelligence test case producer (AITCP) to test artificial intelligence systems (AIS). AITCP starts from a human-generated test scenario and makes changes to it based upon a modification algorithm such as ant colony optimization and genetic approaches. The authors compared the results of the AI-based method and the manual-based method for testing an autonomous navigation control system based on four selected scenarios. The study results show that AITCP can be utilized to effectively test AIS for both surface (two-dimensional) and airborne (three-dimensional) robots.
+Although there are many successful studies about the automated generation of test cases, determining whether a program has passed a given test remains largely manual. Langdonetal.[16]proposedtheuseofsearch-basedlearning from existing open-source test suites to automatically gener- ate partially correct test oracles. They argued that mutation testing, n-version computing, and machine learning could be combined to allow automated output checking to catch up with progress on automated input generation.
+AI software testing differs from AI-based software testing in diverse views such as test objectives, test focuses, test scope, test coverage as well as test techniques and tools. For example, AI-based testing primarily aims to increase
+efficiency for a test process, reduce testing costs by reducing human operations, and increase bug detection effectiveness and speed. AI testing aims to provide on-demand testing services for AI software to support the software validation and quality engineering process. AI-based testing majorly focuses on test selection, automatic test execution, bug detection and prediction based on large-scale testing history data and AI techniques. In addition, AI testing needs innovative continuous, timeliness, and currency testing techniques.
+D. AI-BASED MACHINE TESTING
+AI-based machine learning requires a huge number of inputs as the knowledge and different intelligent algorithms in order to make the right decision. By looking at an example using technology in unmanned vehicles, there will be a basic understanding of how machine learning or machine intelligence works. The development of machine intelligence is still far from mimicking the cognitive competence of the human brain. It is still challenging to deal with those data effectively and make a driving decision accurately and quickly [17]. Machine learning sometimes returns an inaccurate prediction based on the collection of training data, and an engineer needs to make some adjustments to avoid significant losses in terms of public safety.
+DeepLearningisdesignedtocontinuallyanalyzedatawith a logic structure as mimicking how a human can draw a conclusion. The deep learning needs a huge number of data sets to use input in the algorithms in order to result in a more accurate prediction. For instance, Google's AlphaGo, a sharp intellect and intuition game, learns by itself with- out predened data. It makes a more specic move and becomes the greatest player of all. Deep Learning denes a new paradigm based on data-driven programming. Since Machine Intelligence or Deep Learning depends on the train- ing data, the accuracy and quality of data play a vital role for public safety using machine learning in autonomous vehicles.
+Many kinds of research attempt to find solutions for the current obstacles of Machine Learning Systems. To draw optimal decision making, approaches such as Fault Tree Analysis, Fuzzy Logic, Metaheuristic Algorithm, and Artificial Neural Network are developed to test with a huge amount of training data by using different algorithms. However, the sufficiency and versatility of Deep Learning systems are based on the accuracy of the test data set. It is difficult to provide adequate support due to the accessibility of the test data quality issue. The current Deep Learning systems have various vulnerabilities and their system analysis and defect detection are extremely difficult. Unlike traditional software systems, Machine Intelligence does not have a clear controllable logic and understandability since the process to make decisions relies on the training data. The recent study shows two major vulnerabilities in Deep Learning systems: software quality from the output of Deep Learning alone is not adequate; and failure in unseen attacks even though Deep Learning is immune to known types of attacks [18], [19].
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+120171
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+Thus, how to make machine intelligent testable is a great challenge for future AI-based machine testing.
+E. TYPICAL VALIDATION APPROACHES FOR AI SOFTWARE AI software testing could be performed using the following approaches from different perspectives.
+- Classication-based AI software testing, in which classication models for test inputs, contexts, and out- puts and events are set up to ensure the adequate test- ing coverage of diverse input data classes, classied contexts and conditions, and corresponding outputs and classes [20][24].
+- Model-based AI software testing, in which selected intelligentlearningmodelsanddatamodelsareextended to be traceable and testable AI test models to facilitate AIsoftware testingand operationsin qualityassessment of training data and test data.
+- Metamorphic (Non-Oracle) testing, in which a property-based software testing technique is used as an effective approach for addressing the test oracle problem and test case generation problem [25]–[28]. The key element of metamorphic testing (MT) is a set of Metamorphic Relations (MRs), which are necessary features of the target function or algorithm in relation to multiple inputs and their expected outputs.
+- Learning-based AI software testing using the crowd- sourced approach, in which selected machine learn- ing models and approaches are used to learn from crowd-sources testers in a service platform [30].
+- Rule-based AI software testing, in which pre-defined expert-based rules are established and used in AI test generation and validation [32], [34].
+Nevertheless, how to utilize the existing traditional or intelligent approaches to AI software testing is still a great challenge currently.
+F. DATA QUALITY VALIDATION FOR AI-BASED SOFTWARE In recent years, data (such as image and video image) quality assessment has attracted significant attention. Besides, the quality of big image/video datasets with labels also has an important impact on machine learning algorithms, such as deep learning. Using a deep learning approach to train artificial AI programs based on annotated training data sets is
+a popular way to develop intelligent software using a super- vised learning approach. With the increasing installation of video cameras in many cities, image data quality assessment is becoming a very hot research topic in computer vision and smart cities.
+There are a number of causes affecting the quality of image data [48], [49], such as sharpness, noise, tone reproduction, contrast, distortion, etc. Thus, the typical image quality factors are listed as accuracy, accessibility, readability and understandability, consistency [44], etc.
+According to the recent 2018 IEEE NVIDIA AI City challenge [33], manually generating annotated datasets based
+on image datasets from city street transportation cameras brings diverse data quality issues in a deep learning process. Their case study result clearly indicates that the accuracy and quality of derived AI city transportation programs using a deep learning approach highly depends on the quality of annotated training data sets. Based on their experience report, all of the challenge teams encountered diverse data quality issues in annotated training datasets. And they also discovered the urgent needs in quality validation models, methods, and automatic tools for annotated datasets although there are numerous data validation tools for structured data. Therefore, the key issues of quality assurance for big data applications are how to validate unstructured data quality and how to validate system quality in terms of various quality factors.
+Data quality validation and services in a deep learning process for AI software has three dimensions. They are shown as follows.
+- Raw data quality checking, which refers to the quality checking process and activities for collected raw data, such as camera-generated images, and videos. The pri- mary objective is to perform raw data cleaning, quality monitoring, and evaluation to ensure high-quality raw data could be collected.
+- Training data quality validation, which refers to quality validation processes and activities for manually or semi-automatically generated training data sets, such as annotated data sets. Its objective is to improve the generation of training data quality in a deep learning process to increase the training quality for an underlying AI software. The typical concerns include: a) training data scope and coverage, b) training data classification,
+c) training data quality, and d) training data coverage.
+- Test data quality evaluation, which refers to test data quality evaluation based on the validation results of a targeted domain-specific application. For a machine learning application system, the major focus of this task should be facilitating AI system quality problem detection, defect improvement, training quality coverage and domain-based knowledge modeling issues for AI systems.
+IV. TESTING QUALITY ASSESSMENT AND ADEQUACY ANALYSIS
+A. TESTING QUALITY PARAMETERS AND QUALITY ASSESSMENT FOR AI SOFTWARE
+Like conventional software quality testing, quality parameters such as performance, robustness, security, etc., can be applicable to AI software and applications. In addition to the system quality parameters, we must pay attention to specific quality parameters for AI software functions and features. Sample quality parameters for image recognition software are presented as follows.
+- Correctness This quality factor reflects if the recognition result is true when faced with Boolean recognition
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+
+FIGURE 3. AI software test quality assessment.
+items, such as gender, buy or not, recommend or not, age group, etc.
+- Accuracy This reflects the accuracy of the recognition result when faced with numerical recognition items, such as age, gender, and color. Different math indexes can be used to measure it, such as mean difference, variance, standard deviation, distribution interval, confidence level, absolute mean or relative mean.
+- System Stability This reflects the stability of the recognition systems. For example, to recognize the same thing twice or more times, the result should be stable.
+- Timeliness This reflects some indicators related to time, such as the recognition time, training time, and classify time.
+- Recognition Ratio This reflects the recognition ratio of the image system, such as the perfect recognition ratio which means the system recognizes the picture well, or recognition ratio which is divided by absolute mean or relative mean.
+- System Robustness This parameter indicates the robustness of the system. For example, when performing special operations on the recognized picture, we need to check whether the system can still recognize it well. The transformation includes overturning, mirror image, enlarging or shrinking, shearing, shear, gray scale, and changing the dpi.
+- Image Quality This checks whether the recognition systems can deal with the changing of the quality attribute of image, such as gauss noise, spiced salt noise due to the unreliable network transmission, etc.
+Based on the discussed quality parameters above, testing results are analyzed and evaluated for quality assessment. For example, there are five quality factors in the set (QF) here as shown in Fig. 3. As we mentioned, AI software have a number of features (F1,...,Fn), composed of corresponding sub-features (F-s1,..., F-si,..., F-sm). For each measurable feature, we could perform test complexity (TC) analysis. In addition, the quality factors can be measured in terms of pre-defined quality metrics to show their percentage value. Quality Measurement results can be represented using a Radar Chart shown in the left part of Fig. 3. Nevertheless,
+those measurement results need to be validated in practice to indicate their effectiveness.
+B. AI SOFTWARE TEST ADEQUACY AND COVERAGE When AI software can be operated under different contexts and environments, it must be validated under diverse environments to achieve certain context test criteria for vendors and customers. Thus, engineers need well-defined test criteria and an effective test coverage analysis solution. As we discussed in Section II, diverse test models can be constructed and utilized for test coverage analysis. For a knowledge model, AI knowledge test coverage analysis needs to be performed; for a feature model, AI features, sub-features, and feature classification need to be analyzed for test coverage; and for a data-based model, data classification, data relation, data format, data range, etc., need to be addressed for test coverage analysis.
+V. CASE STUDIES — QUALITY VALIDATION FOR ROBUSTNESS OF AN IMAGE RECOGNITION APPLICATION We performed case studies to indicate the feasibility and effectiveness of the proposed quality validation approach provided in this paper. Here we selected a face recognition system as the study object. We performed a case study on a realistic AI application system — "Alibaba Cloud Computing Services Facial Age Recognition API" provided by Alibaba Company using the metamorphic testing method. The base64 encoding of images is submitted to APIs, and the system returns with the recognition results. The experiment data sets are selected from the wiki_crop.tar in the open face dataset IMDB-WIKI. There are a total of 52444 face data, and 10K images are selected randomly as experimental data sets.
+A. QUALITY VALIDATION METHOD DESIGN
+The designed quality validation method is based on the robustness of the age recognition system: The recognition result is deemed better when the real age and recognition age are closer to each other. Facial age recognition is a commonly-used AI application using diverse machine learning algorithms and pattern recognition strategies. There are existing non-oracle problems and, due to the effect of picture quality (such as clarity, lighting, background, and expression), network or other reasons, the robustness of an age recognition system is a basic quality factor in quality assurance. Thereby we need to test the robustness of the system. Based on the understanding of facial age recognition system above, we adopt metamorphic testing to validate the quality of the system. We consider the possible situations that may occur in a recognition process, such as image rotation, translation, landscaping, a watermark of a picture, or the distance between face and camera.
+In this study, we defined two major metamorphic relations MR1 and MR2. For each metamorphic relation, we define several sub-relations. For instance, in MR1, we give two sub-relations MR1-1 and MR1-2, i.e., a) recognized age is
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+120173
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+TABLE 2. Metamorphic relation case partition.
+
+
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+VOLUME 7, 2019
+ C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices
+stable under the spherical transformation (mirror), and b) recognized age is stable under image rotation. In the study, we verified if the image system under testing satisfies the defined MRs. The detailed metamorphic relations and their sub-cases are shown in Table 2. The proposed metamorphic relations are illustrated as follows.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+VOLUME 7, 2019
diff --git a/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt b/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt
new file mode 100644
index 0000000..3c8dfa9
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt
@@ -0,0 +1,88 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2019 IEEE International Conference on Big Data (Big Data)
+Industrial track: Architecting railway KPIs data processing with Big Data technologies
+Alexander Suleykin Peter Panfilov Natalya Bakhtadze
+V. A. Trapeznikov Institute of Control School of Business Informatics V. A. Trapeznikov Institute of Control Sciences, National Research University – Higher Sciences,
+Russian Academy of Sciences School of Economics Russian Academy of Sciences; Moscow, Russia Moscow, Russia Bauman Moscow State Technical
+aless.sull@mail.ru ppanfilov@hse.ru University
+Moscow, Russia sung7@yandex.ru
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract — in our conducted research we have built the data processing pipeline for storing railway KPIs data based on Big Data open-source technologies – Apache Hadoop, Kafka, Kafka HDFS Connector, Spark, Airflow and PostgreSQL. Created methodology for data load testing allowed to iteratively perform data load tests with increased data size and evaluate needed cluster software and hardware resources and, finally, detected bottlenecks of solution. As a result of the research we proposed architecture for data processing and storage, gave recommendations on data pipeline optimization. In addition, we calculated approximate cluster machines sizing for current dataset volume for data processing and storage services.
+Keywords — Big Data technologies, distributed data processing, Hadoop, Spark, railway KPIs.
+I. INTRODUCTION
+Nowadays the open-source solutions are becoming more and more popular and Hadoop stack with its already improved Map Reduce data processing engine is one of the most widely used technologies for big data storage. Based on Hortonworks Data Platform stack, it delivers 100% open-source global data management platforms and services so customers can manage the full lifecycle of their data. This stack is widely accepted by many large companies for data processing, storage, analysis and visualization.
+At the same time, the complexity of big data processing and analysis is extremely increasing due to data volume growth, data variety, velocity, different data formats of data transmission, integration problems and other data complexities. At this point there is always a difficult task to build a robust, reliable and fault-tolerant data processing and storage framework that could handle big data of various formats and high volume from different data sources and systems. The current research is devoted to the application of big data technologies based on HDP Hadoop stack and its ecosystem to the building of data processing and storage platform for railway roads KPIs.
+Performed case study has revealed the applicability of regarded technologies to the building of full data pipeline for data processing and storage for railway KPIs. Selected technologies are Apache Hadoop, YARN, Apache Kafka, Confluent Kafka Connector, Airflow, Apache Spark, PostgreSQL.
+The conducted research generated the synthetic load tests based on datasets of real KPI data from one railway company with initial data load and X1, X2, X4, X8 increments on top of initial load. Load tests have shown the software and hardware bottlenecks for regarded datasets KPIs. The result of the work is formulation of bottlenecks of data processing pipeline, recommendations for optimization of pipeline and architectural sizing of machines and used Big Data services for current dataset of railway KPIs data storage and processing.
+In this paper, the authors have discussed the railway KPIs from railway transportation operations and data-driven distributed computing perspective. Here, after introduction in section 1, the related works on concepts and requirements of KPI frameworks are discussed in section 2. The way to successful implementation of the distributed computing architecture for the railway KPI framework is described in section 3 with architectural layers detailed description in section 4 and dataset examples from railway industry in section 5, followed by experiments with proposed architecture and test results in sections 6 and 7. Discussions on optimization recommendations and conclusions conclude the paper.
+II. RELATED WORK
+Key performance indicator (KPI) is a collection of performance measures that an organization or company uses to monitor its performance over time. KPIs are used to determine a progress in achieving strategic and operational goals of a company, and to compare its performance with others within its industrial sector. Setting KPIs requires smart decision on how many indicators to track to determine the success of business. More over, the relevance of the KPIs must be continuously evaluated to ensure their alignment with priorities in business strategy and operations. Industry-specific KPIs have been created in different markets including retail, healthcare, financial services, logistics, manufacturing and supply chain operations, and transportation.
+The increasing railway traffic and a corresponding need of railway capacity require a more efficient operation, maintenance and railway asset management by infrastructure managers (IMs). To support railway IMs in decision making process, KPIs are developed so that the results of operation and maintenance activities could be measured and monitored.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+978-1-7281-0858-2/19/$31.00 © 2019 IEEE
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 978-1-7281-0858-2/19/$31.00 ©2019 IEEE 2047
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+In literature, one can find examples of projects on KPIs and benchmarking for railway transport operations and railway infrastructure maintenance [1-7].
+However, KPIs used in railway transportation sector are often ad hoc and seldom standardized. In the course of last decade, several programs were undertaken both at national and international levels to bring a common ground to a multiple efforts in developing KPI platforms for managing railway infrastructure.
+In Europe, an increased interoperability and building of a trans-European railway network is one of the goals of the European Union. The required harmonization and standardization of the management of railways have led to increased use of European Standards such as, for example, the European standard; Maintenance key performance indicators (KPIs), EN 15341 [8]. In the paper [9], the authors have proposed performance indicators for railway infrastructure, that have been mapped and compared with indicators of this European standard.
+In 2013, a Platform of Rail Infrastructure Managers in Europe (PRIME) was established to assist in implementation of the Single European Rail Area, better deployment of European Rail Traffic Management System (ERTMS), performance benchmarking and exchange of best practice amongst infrastructure managers. PRIME organization plays the role of the European Network of Infrastructure Managers as foreseen in Article 7f of Directive 2012/34/EU establishing a single European railway area, as amended by Directive (EU) 2016/2370. Among the major tasks of the Network there is a task under paragraph (d) “monitor and benchmark performance, including identification of common principles and practices for the monitoring and benchmarking of performance in a consistent manner”, which is carried out by the KPI's and Benchmarking Expert SubGroup. The subgroup is preparing yearly benchmarking reports, including the most recent PRIME KPI Catalogue [10], which contains the indicators agreed by the expert group and their definitions, set out in a structured and prioritised way following the concept of the balanced scorecard. The KPIs have been developed over a three year period and tested in 3 pilot exercises. These KPIs will be fixed for use in the initial Dashboard tool, but it is expected that they will be developed further and improved on a regular basis in the future.
+New challenges that railway KPI implementations might face are associated with the introduction of the international ISO 55000 standard [11] focused on asset management. The ISO 55000 series standard makes asset performance evaluation (APE) an important aspect of the asset management system (ASM) as per international standard ISO 55001:2014 [12]. The ISO 55000 series standard sets the asset management principles for organizations to follow when developing and implementing all of their functions including units and processes. The APE serves to improve the level of the company's assets to achieve the objectives. The asset performance measurement and management (APMM) is a recognized best practice for preparing a strategic road map from top strategic managerial level to the operational level
+through a link and effect model [13] for identifying and developing KPIs.
+A high level description of the elements of APMM concept can be found in [14], followed by a comprehensive discussion on specific issues and challenges of APMM. Among them, an important new data-driven challenge is ”to define and develop methods for right data collection through condition monitoring and big data management, beside management of knowledge” [14].
+Nowadays, Smart Monitoring and Smart Maintenance (eMaintenance) concepts based on distributed data processing and Big Data platforms are applied for real-time data collection, storage, analysis and decision support. From business objectives prospective, it is important that data collected are linked with KPIs so that they can be analyzed to compare and measure with business strategy and organization. Depending on the business requirements, the KPIs and other indicators can be used for generating composite indicators (CI) [15] for performance benchmarking with the best in the industry, besides verifying the return on investment. Stenström et al, in [15], developed a link and effect model for monitoring and analysis of operation and maintenance performance of rail infrastructure and demonstrated as a case study.
+Data collected from smart monitoring systems in commercial and industrial setups are growing rapidly to be very large in volume, high speed in velocity and vast in variety for the data acquisition, storage, processing and analysis. Big data technologies are used for information extraction through pattern recognition and eMaintenance solutions [16, 17]. While the data collection, data quality, processing and analysis for the asset performance under Big Data analytics has taken focal point, performance measures, indicators and key performance indicators (KPIs) dictates which data is needed to be measured and why [18].
+Big Data analytics provides IMs faster and better decisions that were inaccessible before. Nowadays, most companies use business analytics and data-driven reporting tools to automatically track its KPIs. The modern Big Data and distributed computing solutions help companies to collect relevant data from operational systems and create reports on the measured performance levels. Company's executives and managers are obtaining KPI results on business intelligence dashboards or performance scorecards that include diverse linked data visualizations, with the ability to improve understanding of the company's performance data.
+To guarantee the business success, KPIs and various issues and challenges of APMM should be considered thoroughly. In this paper, we have touched the data-driven challenges of the KPI and APMM frameworks on the basis of our experience in architecting smart monitoring and management systems for mobile network industrial sector [19]. Here we have demonstrated how our expertise in distributed computing and smart data processing can be applied to somewhat similar problem area of railway asset performance monitoring and measuring for establishing railway KPI framework.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 2048
+
+III. CORE ARCHITECTURAL COMPONENTS OVERVIEW Integration Layer Storage Layer Serving Layer
+We propose to use Lambda architecture as a basement
+architectural methodology. Thus, it allows companies to
+handle their data in the most reliable and effective manner for
+majority of use cases. In our previous work [19] we built
+Smart Cellular network monitoring service using Big Data
+methods and tools on top of Lambda-driven architecture. The
+following picture depicts the key Lambda principles:
+Fig. 2. Research data pipeline architectural overview
+The definition of used components is according to the table below (Table 2):
+TABLE II. CORE COMPONENTS DEFINITION
+
+
Component
Definition
1
JBoss Fuse
Industrial data bus for solving the integration problems of the entire company [21]
2
Kafka
Distributed, fault tolerant, horizontally scalable, productive message broker [22]
3
HDFS
Distributed fault tolerant file system optimized for storage for processing large amounts of data [23]
4
Spark
Distributed in-memory framework for high-load data processing [24]
5
PostgreSQL
Relational database to provide BI data to tools [25]
6
AirFlow
Universal Scheduler [26]
Fig. 1. Lambda architecture overview
+It’s widely assumed to highlight the following layers (Table 1):
+TABLE I. ARCHITECTURAL COMPONENTS OVERVIEW
+
+
Component
Purpose
1
New data
New data sources
2
Batch layer
A layer of a full data set optimized for batch calculations. The role model is applied only at the level of subject areas (directories) and storing objects
3
Serving layer
Provides fast (including random) access to structured data for consumers. Data should already be all designed for Batch Layer. A role model is applied with the possibility of limitation to objects (tables), attributes / indicators (columns) and rows
4
Speed layer
Speed layer Designed for streaming data processing and providing access to the most relevant data, i.e. data that has not yet been recounted by the Batch Layer, but has already appeared in the system. The Speed Layer looks only at recent data without access to history, while the Batch Layer looks at the entire data history. Not all indicators can be calculated on this layer
5
Query
Queries from external BI systems
Data transfer from Kafka to HDFS is implemented using Confluent open source solution – Kafka HDFS Sink Connector [9].
+IV. ARCHITECTURAL LAYERS DESCRIPTION AND DEFINITION
+In our research Storage Layer and Serving Layer have their own Layers (sublayers), which are used for methodological correctness of data load. The data pipeline of the whole data movement is strict and should go through the following sublayers inside Serving and Storage Layers:
+Data Storage Layer Serving Layer
+As a Lambda-based driven architecture we have used the following architectural components in our research (fig. 2):
+Fig. 3. The Workflow data pipeline and layers interconnection
+The next table shows the definition and description of each used sublayer:
+TABLE III. DESCRIPTION AND DEFINITION OF SELECTED SUBLAYERS
+Detail Data Store
DDS
Postgre
The layer of the current data slice presented in a relational form.
Re-keying (generation of internal storage IDs). Conversion from object to relational storage. Normalizati on of data (if necessary). Creating a single data model (without unification) Storing a current data slice
Data Mart
DM
Postgre
Groups showcases by a specific attribute, most often the subject area.
+Contains unified detailed data.
+It contains calculated indicators for use in reporting.
+Calculation of indicators used in several reports is necessarily submitted to this layer.
Data unification. Denormaliza tion of data. Data Aggregation. Calculation of derived indicators used in several places.
Report Layer
REP
Postgre
The final reporting layer. From it, data are used only for display in BI tools. It is forbidden to build some reports on the basis of others. Only with the transfer of the information used in the DM layer. Calculation of indicators specific to specific reporting.
+It can be both logical and physical.
Calculation of derived indicators specific to a particular report.
Export Layer
EXP
Postgre
For each data consumer, a scheme is created in which objects are placed for load. The circuit performs almost the same functions as REP
Name
Abbr eviati on
Location
Definition and functions
Transforma tions
Staging Buffer Area
STG/ BUF
HDFS
The area of temporary data accumulation in the format corresponding to the source without any transformations.
+Streaming data comes from sources.
No
Staging Exchange Area
STG/ EXC H
HDFS
The intermediate region for forming the next ETL processing packet.
+All accumulated data are moved from the buffer to form a data processing packet.
+It is assigned a unique BATCH_ID.
BATCH_ID
StagingA rchive Zone
STG/ ARC H
HDFS
Storage of the complete archive of incoming messages without transformation of the storage format.
+Incoming messages are archived after successful processing.
Archiving and enlarging storage files.
Operatio nal Data Store
ODS/ HIST
HDFS
The area in which the source data scheme is stored, but they are reduced to a single binary form of storage. It contains the entire history of changes and deletions.
Convert to binary storage format. Conversion from object to relational storage.
Batch View
ODS/ BW
HDFS
It contains only an actual slice of the state of objects without a change history and deleted records.
Calculation of the actual data slice.
Detail Data Store Staging
DDS_ STG
Postgre
Batch layer. A separate instance is created for each source system. One-to-one data is transferred from HDP and stored only between downloads. Both full data load and only line changes (deltas) can come.
Detail Data Store Logic
DDS_ LGC
Postgre
Layer of transformation logic. Contains data transformation procedures before writing to DDS.
V. RAILWAYS KPIS DATA DESCRIPTION
+The conducted research has been performed using Key Performance Indicators (KPIs) data from one railway company. The data are represented by usual star schema which means that there is one fact table (main table with events – KPIs) and others are dictionaries. The data are corresponded to the 3-rd level of normal form.
+The entities description and data types are the following (Table 4):
+TABLE IV. RAILWAY KPI DATA DESCRIPTION AND IT TYPES
+
+Entity
Attribute
Data type
Description
DATA_T YPE
ID
INTEGER
Dictionary – type of data for KPI. Can be approved or planned
NAME
CHAR
DATE_T
ID
INTEGER
Dictionary – type of date
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 2052
diff --git a/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt b/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt
new file mode 100644
index 0000000..a41cb30
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt
@@ -0,0 +1,178 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+
+See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/337256634
+Foundations of Data Quality Assurance for IoT-based Smart Applications
+Conference Paper · November 2019
+DOI: 10.1109/LATINCOM48065.2019.8937930
+CITATIONS READS
+11 332
+4 authors:
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Rodrigo Togneri
+Escola de Administração de Empresas de São Paulo da Fundação Getulio Vargas 6 PUBLICATIONS 96 CITATIONS
+SEE PROFILE
+Juha-Pekka Soininen
+VTT Technical Research Centre of Finland 108 PUBLICATIONS 3,160 CITATIONS
+SEE PROFILE
+Gláuber Camponogara University of São Paulo
+12 PUBLICATIONS 182 CITATIONS
+SEE PROFILE
+Carlos Alberto Kamienski Universidade Federal do ABC (UFABC)
+218 PUBLICATIONS 2,215 CITATIONS
+SEE PROFILE
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+All content following this page was uploaded by Carlos Alberto Kamienski on 15 February 2020.
+The user has requested enhancement of the downloaded file.
+Foundations of Data Quality Assurance
+for IoT-based Smart Applications
+Rodrigo Togneri
+, Glauber Camponogara http://swamp-project.org/ 5 Antifragility is a property of systems that increase in capability to thrive as a
+, Juha-Pekka Soininen https://agrosmart.com.br/en/ result of stressors, shocks, volatility, noise, mistakes, faults, attacks, or failures
+, Carlos Kamienski1
+rodrigo.togneri@ufabc.edu.br, glauber@agrosmart.com.br, juha-pekka.soininen@vtt.fi, cak@ufabc.edu.br 1Federal University of ABC, Santo André / Brazil
+2Agrosmart, Campinas / Brazil
+3VTT Technical Research Centre of Finland, Oulu / Finland
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract — Most current scientific and industrial efforts in IoT are geared towards building integrated platforms to finally realize its potential in commercial scale applications. The IoT and Big Data contemporary context brings a number of challenges, such as providing quality assurance (defined by availability and veracity) for sensor data. Traditional signal processing approaches are no longer sufficient, requiring combined approaches in both architectural and analytical layers. This paper proposes a discussion on the adequate foundations of a new general approach aimed at increasing robustness and antifragility of IoT-based smart applications. In addition, it shows results of preliminary experiments with real data in the context of precision irrigation using multivariate methods to identify relevant situations, such as sensor failures and the mismatch of contextual sensor information due to different spatial granularities capture. Our results provide initial indications of the adequacy of the proposed framework.
+Index Terms— Data quality, internet of things, smart applications, precision irrigation.
+I. INTRODUCTION
+Nowadays, the Internet of Things (IoT) is increasingly leaving the state of an idea and landing its technology in its first practical projects worldwide. Proof of this evolution is the recent emergence of a series of research and commercial initiatives in the development of complete technological platforms that integrate IoT to the applications. Only in precision agriculture, IOF20201 and SWAMP2 [1], and Agrosmart3 and Agricolus https://www.agricolus.com/ [7].
+ are important scientific and commercial initiatives, respectively. The technical and application challenges are enormous since these platforms enable complex real-time control systems that combine the use of communication infrastructure, hardware, software, analytical techniques and application knowledge combined into multiple layers.
+Within the context of current challenges, this paper addresses the fundamental issue of input data quality. In any IoT-based smart application, the output is highly dependent on the data captured by field sensors. Dealing with the lack of data availability and veracity can be synthesized by the acronym GIGO (Garbage-In, Garbage-Out). In other words, however
+sophisticated smart application models and algorithms are, poor quality input data will result in poor recommendations.
+The solution to this challenge is to increase the smart application data sensing robustness and antifragility 5. The
+straightforward benefit is that robust and antifragile sensing allows the system analytical core input data to be as good as possible. As a result, more reliable decisions are made, generating real value gains for applications and thus helping to maximize the end-user confidence in new technologies.
+Within the strategic objective of realizing the benefits of this general solution, this paper brings two main contributions:
+• The Foundations for a Data Quality Assurance Framework, as a new general vision to increase robustness and antifragility of sensing. Through the composition of complementary approaches, both traditional and cutting- edge ones, the proposed vision is of general use in IoT- based smart applications, although examples here represent the context of precision irrigation.
+• Preliminary Findings with Real Precision Irrigation IoT Data that corroborate with the data quality assurance vision. Preliminary experiments were undertaken using raw sensor data provided by our partner Agrosmart, which raised some initial interesting insights in the automatic identification of data quality problems, diagnosis and treatment. For example, the use of multivariate methods has helped us to identify specific sensor failures and the mismatch of contextual sensor information due to different spatial granularities capture. These results corroborate to part of the proposed vision, particularly related to the anomaly multivariate techniques to process IoT data from multiple sources as a way to implicitly aggregate the application context.
+In the remainder of this paper, Section II brings related work, Section III explains the foundations of the proposed data quality assurance vision, Section IV develops preliminary experiments with real data, Section V presents and discusses the key results of the preliminary experiments, and finally Section VI draws some conclusions.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+1 https://www.iof2020.eu/
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+II. RELATED WORK
+Karkouch et al. presented an overview of the main approaches to data quality in IoT, and the main contributions were the proposition of data quality dimensions and its categories, the systematic analysis of problems and the suggestion of techniques for the treatment thereof [2]. Our work complements it introducing the antifragility concept, valuing multivariate analytical techniques as links between data and its semantics in the application context, and considering also the influence of IoT architecture on data quality.
+Banerjee and Shet realized the importance of addressing the data quality problem in architectural and analytical layers, although kept the discussion at a higher level [3]. Our work completes that discussion by introducing more practical elements towards IoT platforms. Dou and Nan worked specifically on the architectural question seeking to determine the optimization of sensor distribution layout and connectivity [4], although without fitting it into the broader context of data quality.
+Liu et al. discussed data veracity problems and solutions, while this paper seeks to integrate data availability and veracity issues in a single approach [5].
+Sanyal and Zhang presented a compelling solution to the IoT data veracity issue through unsupervised estimation methods that replaced low statistical confidence data [6]. Our work complements it by providing a more sophisticated anomaly detection and classification approach that does not make use of estimation methods, providing a more reliable dataset (without disregarding anomalous but dependable data points – disregarded by estimation methods [7]). Vilenski et al. proposed to use multivariate techniques in detecting anomalies in agriculture [8]. Our work goes further by proposing a more generalist approach, although our practical experiments are also in agriculture.
+OGC http://www.opengeospatial.org
+ (Open Geospatial Consortium) developed open standards for IoT applications, providing two standards suitable for data quality solutions, namely UncertML (Uncertainty Markup Language) and QualityML (Quality Markup Language). This work is in accordance with these standards and intends to contribute with them when the vision proposed here is deployed as a functional framework.
+III. FOUNDATIONS OF DATA QUALITY ASSURANCE
+A. Data Quality Issues: Availability and Veracity
+Data availability and veracity are key issues in IoT operations. The former is straightforward, i.e., if there are no stimuli coming from sensors, there is no reaction. And, the latter because if the sensor stimuli are relevantly inaccurate, the reactions may be inappropriate or even harmful. We want to maximize data availability, and within available data, we want to maximize their veracity.
+Possible types of IoT data quality issues can be divided into availability and veracity problems. Data availability problems include:
+• Error Data: Occurs when the sensors data capture system identifies a known problem, emitting a specific signal to it. The data is clearly invalid, and as it is easily identifiable, it must be converted into missing values. As a result, data becomes unavailable.
+• Data Interruption: Occurs when a sensor data does not reach its reader. Regardless of the cause, data also becomes unavailable.
+Data veracity problems include:
+• Unbalanced Data: Occurs when sensor data is emitted and captured, but this data is not reliable to the measured phenomenon. Data is available but is not dependable.
+• Non-Correspondence of Different Granularity Data: Occurs when there are valid sensor data, although there is a mismatch between different sources due to different space or time granularities of the sensing system.
+B. Increasing Sensing Robustness and Antifragility
+Data quality assurance can be achieved by acting on both architectural and analytical layers [3]. Fig. 1 shows the big picture of how these layers are placed in an IoT-based smart applications data flow. The Data Quality Assurance Framework is the phase coming right before Information Processing, which is the system core analytical task.
+
+Fig. 1: Data Quality Assurance Framework as a Data Transforming / Influencing Agent Through IoT-Based Automated Systems Data Flow.
+Taleb [7] provided an important contribution to risk management by stating that robustness is not the opposite of fragility, introducing the concept of antifragility and making it easier for systems to be built to evolve with exposure to its environment. Since then, many engineering areas have been using advanced analytical techniques in the search for systems evolution [9] [10]. Taleb introduced a sensitivity scale of things to the environment instabilities (Fig. 2): at one extreme is the concept of fragility, in which things are harmed by instability; in an intermediate position is the concept of robustness, in which things are invariant to instability (do not harm or benefit); at the other extreme is the concept of antifragility, in which things benefit from instability and become better, i.e. things that increase in capability to thrive as a result of
+
+Fig. 2: Taleb Scale and Correspondence with Data Quality Assurance Effort Types (Architectural or Analytical).
+stressors, shocks, volatility, noise, mistakes, faults, attacks, or failures [7].
+As environment instabilities usually bring new and unknown circumstances that cannot be managed by supervised machine learning [7], the antifragility vision states that these techniques should be underprivileged in relation to unsupervised and reinforcement machine learning, which are more adequate to really learn the unknown. Consequently, this is our first suggestion for an IoT data quality assurance framework.
+Thus, between the two data quality assurance layers, although the architectural plays an important role, the one that has the greatest potential to flexibilize towards antifragility is the analytical, because it can evolve action rules over time by means of experiencing the data (machine learning). The more data and the more instabilities, the more the system learns and improves.
+1) Analytical Layer Approaches
+In the past, sensors were preferably subject of electric and electronic engineering, due to their use in equipment of highly specific and local applications. Data treatment was fully performed by signal processing techniques based on mathematical filters for eliminating noise, and keeping only the signal (relevant data) of individual sensors.
+On the other hand, in the current IoT and Big Data era, data is becoming more complex and is directly linked to its meanings in smart applications: many dimensions, of different types, with nontrivial relationships among each other - nonlinearities, lag effects - and used in decisions in social environments or others of equal sensitivity. For example, in precision irrigation, a series of meteorological, soil moisture and crop growing stage data can be collected as input to water need estimation, and the relationship among these variables can be considered of high-complexity [11]. Asymmetries of soil moisture behavior also occur as their value, soil depths and the time varies. There is still a data type variety: while most data are series of quantitative variables, others of great relevance as georeferenced images are of semi or non-structured nature, mixing quantitative and qualitative values.
+The complexity is not only in the nature of data but also from the data collection architecture, since sensors are sparsely spread on the space (they often have geo-referential characterization), have different periodicities and deal with fault tolerance concepts.
+Thus, the traditional signal processing approach is no longer sufficient, requiring an evolution that here we call Signal Processing 2.0, which is an IoT adaptable data flow
+based on multivariate unsupervised and reinforcement machine learning techniques. In this context, the analytical layer of our data quality assurance vision aims at bridging this gap. Further, the current scenario requires data treatment to be the target of the most powerful arsenal of machine learning techniques.
+ Fig. 3 synthesizes the data treatment flow in the analytical layer at a higher level. Also, flows differ depending on the type of data problems. The four steps of the analytical layer are:
+Fig. 3: Macro-flow of Data Quality Assurance in the Analytical-layer.
+a) Anomaly Detection
+Data veracity problems cannot be easily identified because data belong to the expected domain range, and for this reason it is customary to use data mining techniques [12]. In this sense, the techniques of anomaly detection [13] [14] propose to identify out of context values and sometimes classify it. In the traditional signal processing realm, univariate applications (a single signal) are more common. However, in the more modern context of IoT and Big Data, multivariate techniques, the ones that consider the relationship among multiple data sources, gained a lot of attention due to their ability to identify anomalies inaccessible to univariate techniques.
+b) Determining the Validity of Anomalous Values
+A data point being anomalous does not mean that it is also invalid. It may simply be caused by the occurrence of a rare but real event, which obviously must be regarded as a valid point. At this step, therefore, one must seek for: i) automatic separation of valid from invalid anomalous points, through comparison with theoretical or empirical models [15], or using anomaly detection techniques; and ii) in case of an invalid point, if possible, define which variables are the cause of the anomalous effect, for discarding only data from the offending variable). This step is difficult to replicate for different applications, as it relies on domain specific knowledge (i.e.
+theoretical or empirical models).
+c) Assigning Missing Values to Invalid Values
+Invalid values should not be used in analytical applications for preventing harmful results. This is the easiest step, and since the invalid values have already been identified, the only task here is to replace invalid by missing values.
+d) Data Reconstruction
+The previous step gives us a more reliable dataset. In this step, missing values are reconstructed from valid ones using
+different techniques such as estimation methods [16]. When time series anomaly detection techniques [13] are adequate, or when there were incomplete original cases (which were therefore not considered in some anomaly detection approach), the reconstructed data come back to the anomaly detection step.
+2) Architectural Layer Approaches
+The architectural layer, encompassing elements as diverse as hardware / software development and data capture and communication solutions, naturally has a myriad of possible approaches. Here we emphasize higher-level architectural aspects that are key to sensing robustness and antifragility.
+Fig. 4 synthesizes the influence map of the architectural layer in the system. It highlights the two main practical approaches: (a) use of sensors grid [17] and (b) use of image-based sensors (drones, satellites) [18]. Both allow a lower granularity of physical space, potentiating contextual spatial knowledge, also impacting the analytical layer by using spatial statistics techniques, with positive consequences in the system antifragility.
+
+Fig. 4: Map of Influence of the Architectural Layer on the Analytical Layer of the Data Quality Assurance Framework.
+The use of a sensors grid naturally brings an additional gain of robustness, because the sensors are physically distributed and a fault in one can be covered by an estimate from others nearby. Conversely, the gain in robustness is not natural in the use of image-based sensors, because sensors are concentrated in a single piece of equipment (drone or satellite), and, in case of a failure, all the space points are lost simultaneously. This is known as the SPOF (Single Point Of Failure) problem, which can be dealt with by using redundant equipment.
+IV. PRELIMINARY EXPERIMENTS WITH REAL DATA
+We performed preliminary experiments with real data from the precision irrigation domain, which provides evidence of the potential of using our vision for data quality assurance. Specifically, these experiments work within the scope of the anomaly detection step of the analytical layer and demonstrate the value of multivariate approaches.
+A. Agrosmart and the Dataset
+Agrosmart is a Brazilian company that provides crop intelligence services, using a proprietary IoT platform and application of advanced analytical techniques. It provided raw data for this study, from operations of five farms with soybeans crop for a period of approximately 2 years, starting in the first
+half of 2016 (depending on the beginning of each culture cycle) until the end of August 2018. Each farm has 1 to 5 management zones, the internal spatial components of a farm, divided usually by soil characteristics.
+This dataset contains sensor data, such as7: a) for the spatial granularity of the whole farm: air temperature ℃ , soil temperature (at 40 cm deep) ℃ , global solar radiation
+/ , air relative humidity [%], wind speed / , wind direction ° and atmospheric precipitation (rainfall) ; b) for the spatial granularity of the management zone (with a single sensor probe): soil water tension8 (at 20, 40 and 60 cm
+deep) , irrigation management , and, in some cases, atmospheric precipitation . The temporal granularity of the raw data ranges between 5 and 30 minutes, depending on the variable and the farm or management zone. Further details are omitted due to confidentiality issues.
+B. Approach
+When considering the anomaly detection step, the most important aspect is if multivariate approaches are useful to detect veracity problems. In order to simplify the results, only two variables are considered: atmospheric precipitation (farm) and soil water tension at 20 cm deep9 (management zone), aggregated by day. From the raw variables, we derived new ones, due to their semantics in the agriculture context:
+• Previous Soil Water Tension 20cm-deep : Soil water tension measured at 20 cm depth at the very beginning of the reference date (management zone).
+• 1-Day-Delta (Soil Water Tension 20cm-deep) : Variation value of soil water tension 20cm-deep at the reference date.
+• 1-Day-Precipitation : The total precipitation occurred at the reference date (farm).
+We used LOF (Local Outlier Factor algorithm) [19] [14], one of the most successful anomaly detection techniques for modern Big Data environments. LOF is a multidimensional anomaly detection technique based on KNN10 for computing spatial density and providing a real numerical value (of domain [0, ∞)) for each data point: the closer to 1, the more a certain point is similar to its neighbors, indicating that this point belongs to a cluster of points sharing a common behavior. On the other hand, the more distant from 1, the more unusual is the behavior of that point, which becomes an anomaly candidate.
+For this experiment, data was cleaned from obviously invalid values (error data or out-of-domain values), and data was not reconstructed (i.e., data with missing values), as it is a simplified experiment. The presence of missing values means that LOF is only applied to data points with non-missing values in all the considered variables.
+7 All measurements are taken as recommended by [16].
+8 Pressure that the plant needs to exert to consume soil water. 0 kPa indicates extreme ease and 200 kPa represents a severe condition to plant.
+9 At this depth the response to water intake is immediate.
+10 In KNN (K Nearest Neighbor) algorithm, we used K = 15, arbitrated in response to the parameter stability criterion established in [14].
+V. RESULTS AND DISCUSSION
+LOF generated approximately the same results for all management zones and farms, so that, without loss of generality, only the results of one management zone of one farm are presented. Fig. 5 depicts the scatter plot of the 3 derived variables. Filled circles denote a behavior considered common by LOF (considered cut-off value: 4)
+, whereas points in other shapes represent anomalous behavior:
+• Red triangle: The soil is previously dry (close to 200 , sensor ceiling value), with no relevant precipitation, although an extreme jump of water availability is observed in the soil, which is highly unexpected.
+• Blue cross: Unusual soil drying jumps, when the expected behavior is a smoother drying process, even for days with no precipitation.
+• Purple star: Extreme cases of the blue crosses, where soil water availability is high (values close to 0 ), but the
+
+Fig. 5: Indication of Anomalous Points in the Data of One of the Management Zones and Farms - Scatter Plot Version.
+soil dried completely (values close to 200 ) in only one day, a highly unexpected phenomenon.
+Fig. 6 complements the analysis of Fig. 5 showing results in a timeline. We can see that red triangles are usually preceded by points with an opposite movement (purple stars and blue crosses), and between them we usually see points characterized by a yellow band, which are sequential points without any variation of values in the soil sensor (a time series anomaly behavior itself). By the domain knowledge, we know this pattern means soil sensor malfunction. However, we could infer that conclusion only by observing these rare events together (anomaly convergence). It is a clear example of how multivariate techniques and the convergence (in space or time) of multiple anomalies can identify real problems, and consequently differentiate them from rare but real phenomena. In other words, it is a way to use domain knowledge implicitly.
+The blue crosses are harder to have their veracity determined only by Fig. 5, since their behavior is not as extreme as the purple stars and red triangles. However, Fig. 6 highlights that when they have similar patterns, almost glued to a yellow band, it suggests that they also indicate a failure. Once more, there is an anomaly convergence indicating a failure.
+Another challenging case is the last red triangle point at the end of January 2017, because it is within the acceptable range of the three variables. However, it is in a marginalized condition according to the joint behavior, something that only a multivariate technique can capture. This happens when there was no precipitation but a significant increase in soil water tension was observed. Such abnormal behavior may have occurred either by a sensor data distortion (precipitation may have occurred without being captured in data) or by non- correspondence of different granularity data (Section III-A). The latter is the most likely reason, since the soil data is from the management zone and the precipitation data is from the farm. Sensor problems are also less likely to have happened in this case because the sequential points are of common behavior (the red triangle in question is a single anomaly among common ones). Thus, this is an example where the non- correspondence of different granularity data can insert invalid
+data even though each sensor is emitting valid values.
+Also in Fig. 6, most highlighted anomalous points occur in the off-season period (crop interval time), which makes sense, since the sensors can be in preventive maintenance or even are not being monitored because they are not in use anyway. However, other anomalous points (such as the last red triangle point) occurred during the crop period, when usually expressive anomalies are less frequent, making the detection more difficult. In all cases, the anomaly detection experiment revealed interesting results, identifying both expressive and subtle anomalies, in both off-season and season periods. Even in a simple experiment with few variables and a single technique, it provided a preliminary validation of our data quality assurance framework vision, showing that future work is welcome to improve it.
+
+Fig. 6: Indication of Anomalous Points in the Data of One of the Management Zones and Farms – Time Series Version.
+VI. CONCLUSION
+In response to the gap in the IoT literature in data quality, this paper proposes a new data quality assurance framework vision as a new approach to address the key practical challenges imposed by the new IoT platforms in the context of Big Data.
+Real data of precision irrigation operations were used in preliminary experiments seeking to find some evidence of the adequacy of some of the key elements proposed in the framework. In this case it was the importance that unsupervised multivariate criteria, such as LOF, can play in the process, mainly helping to identify, validate and interpret anomalous values within the larger objective of guaranteeing data veracity. Most of the identified failures in the experiment were not identifiable by normal signal processing approaches, but only by the joint of multivariate criteria (anomalies were subtle, in multivariate context) and of the anomaly convergence phenomenon (in some cases, it even replaced specific domain knowledge need). We have observed that, in identifying valid and invalid anomalies, of expressive or more subtle detection, the experiments could be considered successful in encouraging new ones in a more complete version of the proposed vision, as a functional framework.
+A straightforward next step is to deepen the experiments and analysis with real data, by comparing several techniques of anomaly detection, veracity criteria and data reconstruction as
+well as the establishment of a feature engineering process for the capture of asymmetries and time effects among the variables.
+REFERENCES
+[1] C. Kamienski, J.-P. Soininen, M. Taumberger, R. Dantas, A. Toscano, T. Salmon Cinotti, R. F. Maia and A. Torre Neto, "Smart Water Management Platform: IoT-Based Precision Irrigation for Agriculture," Sensors 2019, vol. 19, p. 276, 2019.
+[2] A. Karkouch, H. Mousannif, H. Al Moatassime and T. Noel, "Data Quality in Internet of Things: A State-of-the-Art Survey," Journal of Network and Computer Applications, vol. 73, pp. 57-81, September 2016.
+[3] T. Banerjee and A. Shet, "IoT Quality Control for Data and Application Needs," IEEE Intelligent Systems, vol. 32, no. 2, April 2017.
+[4] R. Dou and G. Nan, "Optimizing Sensor Network Coverage and Regional Connectivity in Industrial IoT Systems," IEEE Systems Journal, vol. 11, no. 3, September 2017.
+[5] X. Liu, S. Tamminen, X. Su, P. Siirtola, J. Röning, J. Riekki, J. Kiljander and S. J.-P., "Enhancing Veracity of IoT Generated Big Data in Decision Making," IEEE International Conference on Pervasive Computing and Communications Workshops (PerCom Workshops), 2018.
+[6] S. Sanyal and P. Zhang, "Improving Quality of Data: IoT Data Aggregation Using Device to Device Communications," IEEE Access, vol. 6, November 2018.
+[7] N. N. Taleb, Antifragile: Things That Gain From Disorder, Random House Incorporated, 2012.
+[8] E. Vilenski, P. Bak and J. D. Rosenblatt, "Multivariate Anomaly Detection for Ensuring Data Quality of Dendrometer Sensor Networks," Computers and Electronics in Agriculture, vol. 162, pp. 412 - 421, 2019.
+[9] M. Lichtman, M. T. Vondal, T. C. Clancy and J. H. Reed, "Antifragile Communications," IEEE Systems Journal, vol. 12, no. 1, March 2018.
+[10] M. Monperrus, Towards Antifragile Software: Knowledge-driven Perturbation of Software Systems with Active Learning, P Preux, 2016.
+[11] R. Allen, L. Pereira, D. Raes and M. Smith, "Crop Evapotranspiration- Guidelines for Computing Crop Water," FAO Irrigation and Drainage Paper 56, FAO, 1998.
+[12] V. Pendyala, Veracity of Big Data: Machine Learning and Other Approaches to Verifying Truthfulness, Apress Berkely, 2018.
+[13] V. Chandola, A. Banerjee and V. Kumar, "Anomaly Detection: A Survey," ACM Computing Surveys, September 2009.
+[14] L. Cao, C. Kuhlman and E. Rundesteiner, "Distributed Local Outlier Detection in Big Data," Conference Paper, August 2017.
+[15] L. Berti-Équille and J. Borge-Holthoefer, Veracity of Data: From Truth Discovery Computation Algorithms to Models of Misinformation Dynamics, Morgan & Claypool Publishers, 2018.
+[16] C. Crocetta, Theoretical and Applied Statistics, Treviso: Springer, 2015.
+[17] A.-u. Rehman, A. Z. Abbasi, N. Islam and Z. A. Shaikh, "A Review of Wireless Sensors and Networks' Applications in Agriculture," Computer Standards & Interfaces, vol. 36, no. 2, pp. 263-270, February 2014.
+[18] M. Kulbacki, J. Segen, W. Knieć, R. Klempous, K. Kluwak, J. Nikodem,
+J. Kulbacka and A. Serester, "Survey of Drones for Agriculture Automation from Planting to Harvest," IEEE 22nd International Conference on Intelligent Engineering Systems (INES), 2018.
+[19] M. M. Breunig, H.-P. Kriegel, R. T. Ng and J. Sander, "LOF: Identifying Density-Based Local Outliers," Proceedings of the 2000 ACM SIGMOD international conference on Management of Data, pp. 93-104, 2000.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+View publication stats
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt b/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt
new file mode 100644
index 0000000..78390ab
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt
@@ -0,0 +1,202 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Noname manuscript No.
+(will be inserted by the editor)
+Quality model for evaluating and choosing a stream processing framework architecture
+Youness Dendane Fabio Petrillo Hamid Mcheick Souhail Ben Ali
+2019 Jan
+Abstract Today, we have to deal with many data (Big data) and we need to make decisions by choosing an architectural framework to analyze these data coming from different areas. Due to this, it becomes problematic when we want to process these data, and even more, when it is continuous data. When you want to process some data, you have to first receive it, store it, and then query it. This is what we call Batch Processing. It works well when you process big amounts of data, but it finds its limits when you want to get fast (or real-time) processing results, such as financial trades, sensors, user session activity, etc. The solution to this problem is stream processing. Stream processing approach consists of data arriving record by record and rather than storing it, the processing should be done directly. Therefore, direct results are needed with a latency that may vary in real-time.
+In this paper, we propose an assessment quality model to evaluate and choose stream processing frameworks. We describe briefly different architectural frameworks such as Kafka, Spark Streaming and Flink that address the stream processing. Using our quality model, we present a decision tree to support engineers to choose a framework following the quality aspects. Finally, we evaluate our model doing a case study to Twitter and Netflix streaming.
+1 Introduction
+More and more data is produced today, and different techniques have been developed in order to process this data. Due to modern Big Data applications, like sensors, stock-trading or even user web traffic [6] data has to be processed
+Université du Québec de Chicoutimi
+Department of Mathematics and Computer science
+555 boulevard de l'Université
+Chicoutimi, Canada
+E-mail: dendaneys@gmail.com,fabio@petrillo.com,hamid mcheick@uqac.ca,souhail.ben- ali1@uqac.ca
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Title Suppressed Due to Excessive Length 13
+in real-time. The technique that can handle this problem is called : stream processing [5].
+So we have witnessed the rise of Stream processing frameworks, such as Samza and Flink, which are becoming more and more popular, for offering a model to ingest and process data at near real-time [7].
+However, with several stream processing frameworks and technologies associated available, a problem arises: how to choose the right framework? Each framework has its own features and is more or less different from another framework.
+So, depending on the context, you choose the best solution. But another problem occurs here: on what criteria do you base this choice? In this paper, we provide a quality model for decision taking. This model, enforced by what we call variables/criteria, can help you through a decision and we see if it is suitable to choose a stream processing framework.
+We identify and explain in detail four criteria that are important for the framework decision making. Further, we quickly present the selected frameworks with their pros and cons. The criteria and the frameworks have been chosen following a study of stream processing papers. We analyzed these papers, and picked based on an average, the most redundant.
+The rest of the paper is organized as follows: we analyze the related work that has been done (ii), and then answer the previous questions by identifying what are the different criteria you have to base on (iii) and by introducing the different chosen stream processing frameworks (iv). We propose a decision model tree supported by the previous parts, that you can base on to choose the right framework technology (v).
+2 State-of-the-art/ Related Work
+A stream processing system requires four major elements: (1) best understanding of the streaming applications architecture, (2) identification of key requirements of distributed stream processing frameworks (DSPF) that can be used to evaluate such a system, (3) survey of existing streaming frameworks, (4) evaluation and a comparative study of the most popular streaming platforms. We divide the related work based on the four elements mentioned above.
+2.1 Architecture of streaming applications
+Streaming applications architecture is not too much different from web architectures. Streaming sources are communicating using arbitrary protocols. So that, a gateway layer is set up to connect sources to streaming application and resolve the heterogeneity of sources protocols. Message queues are set up as a middleware to provide a temporary buffer and a routing layer to match the accepted event sources and the applications [11].
+2.2 Requirements of distributed stream processing frameworks
+There are eight rules [12] that serve to illustrate the necessary features required for any system that will be used for high-volume low-latency stream processing applications.
+- Rule 1: Keep the Data Moving by achieving a low latency
+- Rule 2: Query using a high-level language like SQL on Streams (StreamSQL)
+- Rule 3: Handle Stream Imperfections (Delayed, Missing and Out-of-Order Data)
+- Rule 4: Generate Predictable Outcomes
+- Rule 5: Integrate Stored and Streaming Data
+- Rule 6: Guarantee Data Safety and Availability
+- Rule 7: Partition and Scale Applications Automatically
+- Rule 8: Process and Respond Instantaneously
+2.3 Existing streaming frameworks
+Several streaming frameworks have been proposed to allow real-time large scale stream processing. This section sheds light on the most popular big data stream processing frameworks:
+2.3.1 Apache Spark [15]
+Developed at UC Berkeley in 2009 [19], is a platform for distributed data processing, written in Java and Scala. In spark, streaming computation is treated as a series of deterministic batch computations on small time intervals.
+2.3.2 Apache Storm [18]
+is a real-time stream processor, written in Java and Clojure. Storm is a fault tolerant framework that is suitable for real time data analysis, machine learn- ing, sequential and iterative computation.
+2.3.3 Apache Flink [17]
+is an open source processing framework supporting both stream and batch. It provides several benefits such as fault-tolerance and large scale computation [14]. Many functionalities are offered by this platform, such as additional high level functions like join, filter and aggregation; it allows iterative processing and real time computation on stream data collected by different tools such as Flume [20] and Kafka [21].
+
+Fig. 1 Frameworks comparative
+2.3.4 Apache Samza [16]
+is created by Linkedin to solve various kinds of stream processing requirements such as tracking data, service logging of data, and data ingestion pipelines for real time services [14]. It uses Apache Kafka as a distributed broker for mes- saging, and Hadoop YARN for distributed resource allocation and scheduling [14].
+2.4 A comparative between processing frameworks
+The comparison between those several frameworks listed above are data for- mat, types of data sources, programming model, cluster manager, supported programming languages, latency and messaging capacities [14].
+3 Paper Contribution
+The work reported in this paper can be categorized under the class of decision help of choosing a stream processing framework. While there is a rich body of work in designing stream processing applications and huge comparative between these applications, a system that can help you to choose
+the best application by criteria is still missing from contemporary stream processing systems.
+In this paper we discuss some architectural frameworks such as Storm, Spark and others that resolve the Stream processing problem and we provide a quality model to choose and evaluate a stream processing framework based on some criteria such as latency, guarantees, fault tolerance and data processing model.
+4 Survey of Stream Processing Frameworks
+In this section, we will present 4 frameworks that are used actually to resolve stream processing problem.
+4.1 Storm
+Storm integrates with any database (e.g: MongoDB) and any queuing system (e.g: RabbitMQ, Kafka).
+Storm works with tuples. A tuple is a named list of values and can contain any type of object.
+Its API is simple and easy to use due to only three abstractions :
+1. Spout : A spout is a source of streams and reads from a queuing broker.
+2. Bolt : Where most of computation's logic goes. Computation logic can be functions, filters, streaming joins, streaming aggregations etc. So basically, from an input, and with computation logic you can produce new output streams.
+3. Topology : A network of spouts and bolts.
+Storm is scalable, fault-tolerant and has an at-least once message guarantee semantic. The cons here are that there are no ordering guarantees and duplicates may occur.
+Another of its strengths is if a node dies, the worker will be restarted on an- other node. If a worker dies, Storm will restart it automatically.
+At the date of writing this article, with Storm SQL integration, queries can
+be run over streaming data, but it is still experimental.
+Furthermore, Storm provides an exactly-once guarantee with Trident which is a high-level abstraction. This model is a micro-batch processing model that adds a state and will increase latency.
+4.2 Spark
+Spark is an hybrid framework which means it can perform batch as well as stream processing.
+Spark natively works with batch, but it has a library called Spark Streaming
+that can allow to work with near real time data. It means that incoming data
+are regrouped into small batch and then processed without increasing the latency too much unlike Storm which provides true streaming processing.
+One of its strengths is that the manner you write batch jobs is the same as the manner you write stream jobs. More than that, it is fault-tolerant and has exactly-once semantics.
+Spark has its own modules that you can combine :
+{ Spark SQL
+{ Spark Streaming
+{ Machine Learning
+{ GraphX (for graph programming)
+Spark runs in Hadoop, Apache Mesos, Kubernetes, standalone or in the cloud and access diverse data sources such as HDFS, Cassandra, etc.
+4.3 Samza
+Samza is decoupled in three layers [8] :
+1. Streaming
+2. Execution
+3. Processing
+4.3.1 Streaming
+For the message queuing system, Samza uses Kafka. Kafka is a distributed pub/sub and it has an at-least once message guarantees. Kafka consumers subscribe to topic, which allow them to read messages.
+4.3.2 Execution
+Samza uses YARN to run jobs. It allow to execute commands on a cluster of machines after allocating containers. This is made possible because of YARN, which is the Hadoop's next generation cluster scheduler. So, YARN provides a resource management and task execution framework to execute jobs.
+4.3.3 Processing
+It uses the two layers above; input and output come from Kafka brokers. YARN is used to run a Samza job and supervise the containers. The processing code the developer write runs in these containers. Samza's processing model is real time.
+One of Samza's advantages is that the streaming and execution layers can be replaced with any other technologies. Also, because of the use of YARN,
+Samza is fault tolerant; Samza works with YARN to transparently migrate tasks to another machine.
+The processing models Samza provides are both batch and stream (real time). Whatever the code you write, it will be reusable whatever the model. Switching models needs a config change: from HDFS to Kafka to pass from batch to stream processing.
+4.4 Flink
+Flink supports batch and real-time stream processing model. It has an exactly- once guarantee for both models. Flink is fault-tolerant and can be deployed to numerous resource providers such as YARN, Apache Mesos and Kubernetes; but also as stand-alone cluster.
+One of the advantages of this framework is that it can run millions of events per seconds by using the minimum of resources, all of this at a low latency. Flink provides three layered API's :
+1. ProcessFunction : It implements the logic, process individuals or grouped events and give control over time and state.
+2. DataStream : Provides primitives for stream operations such as transfor- mations. It is based on functions like aggregate, map and reduce.
+3. SQL : To ease the writing jobs for analytics on real time data.
+5 Criteria used in frameworks
+To choose a stream processing framework, we have identified some criteria. These criteria don't give you the answer on whether you should use stream processing or batch processing, but rather help you take the decision to pick the right framework. So this step assumes that you already identified the problem and you came to the idea that you should use the stream processing model over batch processing.
+We rst are going to give the criteria and explain them in details :
+{ Latency
+{ Message semantics (guarantees)
+{ Fault tolerance
+{ Data processing model (micro-batch or real-time)
+5.1 Message semantics
+Another term referring to this criteria is Message guarantees. The message guarantees can take three forms :
+{ At least-once : could be duplicates of the same message but we are sure
+that it has been delivered
+{ At most-once : the message is delivered zero or one time
+{ Exactly-once : the message is guaranteed to be delivered exactly one and
+only one time
+Before providing message guarantees, system should be able to recover from faults. [6]
+5.2 Fault tolerance
+Streaming applications run for an indefinite period, so this increases the chance of having faults. So this criterion is important, because the application must keep operating despite faults.
+Fault tolerance guarantees that the system will be highly available, operates even after failures and has the possibility to recover from them transparently. Flink has the highest availability.
+5.3 Latency
+Latency is the time between arrival of new data and its processing [10]. Latency goes hand in hand with recovery (fault tolerance) because, whenever the system has errors, it should recover fast enough so the latency doesn't increase too much (i.e., the processing continues with minimal effect). Also, each framework can do some optimization on data, such as message batching, to improve the throughput, but the cost is sacrificing latency.
+5.4 Data processing model
+To do stream processing, there is two techniques :
+{ Micro-batch : based on batch processing but rather than processing data
+that have been collected over previous time, data is packaged into small batches and collected in a very small time intervals and then delivered directly to the batch processing. Spark for example does micro-batch.
+{ Real-time : data is processed on y as individual pieces, so there is no
+waiting. Flink process data in real-time.
+As messages are received directly, the real-time processing technique has a lower stream processing latency than micro-batch, but it becomes harder to have exactly-once semantics. However, micro-batch provides better fault-tolerance and thus it can guarantee that the message has been received only once (i.e., Spark Streaming).
+What we understand here is that message semantics are related to the fault tolerance and the data processing model, and according to how the fault tolerance is implemented the latency will increase or decrease.
+
+Fig. 2 Frameworks per paper
+
+Fig. 3 Criteria per paper
+6 Quality Model for choosing and evaluating a SPF
+After presenting the different frameworks and finding the main characteristics/criteria, we came up with a model. A model for evaluating the frameworks and choosing one given a set of criteria. In this section, we explain why we have chosen these particular frameworks and how we extracted certain criteria. Afterward, we explain how we have prioritized the criteria, and then, with all this information we present the quality model.
+6.1 Methodology
+There are several processing frameworks used in production today. But to find
+out what framework is used in which company is difficult and takes time. So, our primary support was the research papers. We analyzed various papers about stream processing, and we defined redundancy as our benchmark. This means that we made a table with the papers and frameworks, and every time a paper cited a framework we gave a point to the paper. At the end, we had a table with the frameworks cited per paper.
+We repeated the same process for the criteria. The result is in figure 3.
+This paper is a first draft, and we plan to study more papers to have more criteria and frameworks, and thus, to have better average results.
+6.2 Choosing and prioritizing the criteria
+After finding the criteria, we had to prioritize them. Here are the criteria ranked by importance.
+1. Data model
+2. Fault tolerance
+3. Message semantics
+4. Latency
+The first decision is what type of stream processing to choose, because this will have an impact on the other criteria. If you choose a micro-batch framework, it will be possible to have for each framework an exactly-once message semantics as opposed to a real-time model.
+Latency is of great importance, but, a framework should be able to recover fast enough, so it does not affect the system too much (with minimum time). And before providing message semantics it also should recover from faults automatically. Because it will influence the other criteria beneath it, this is why the fault tolerance is in second position.
+Depending on whether it is exactly-once or at least-once message semantics, the latency will change depending on this criteria.
+6.3 Decision Model Tree
+Based on the previous parts, we present the decision model tree to evaluate and choose a stream processing framework (fig. 4).
+7 Case studies
+In this section, we analyze some stream processing application cases. We go through two companies: Netflix and Twitter.
+The goal of this section is to see if our contribution in this paper corresponds to the reality (i.e., real world applications). In analyzing how and why these companies use stream processing frameworks, we can identify the main underlying elements and compare them to our criteria. We get all information from papers and the companies' tech blogs.
+7.1 Twitter
+Twitter has actually an in-house framework called Heron. But before that, they were using Storm. We are going to detail framework evaluation for Storm, because Heron is an improvement but they are still using what we detail below.
+The company that had made Storm was acquired by Twitter in 2011. Since then, Twitter has modified it for their use.
+
+Fig. 4 The decision model tree
+Let's begin with our first criteria: data processing model. At Twitter, due to choosing Storm, as we described it above, it has a micro-batch processing model. So, just by using it, the choice of data processing model has been made. We go now to our second criteria: fault tolerance. When Twitter describes Storm [18], they say that one of the arguments chosen to design Storm is: resilient (i.e., fault tolerant); their second criteria and ours correspond. As they say in the article [18], one of the key features is the processing semantics or message semantics. They describe that their solution has two guarantees: at least once and at most once. This characteristic corresponds to our third criteria we have mentioned. Further in the article, Ankit et al. report some experiments they have made that had to show the latency results. As they calculated, their latency is close to 1ms 99% of the time. Our criteria are justified by the design and the use of Storm at Twitter.
+In this first subsection, we can conclude that our criteria match with the main characteristics of design and use of Storm at Twitter.
+7.2 Netflix
+In their article [22], they describe Keystone which is their stream processing platform. The solution chosen to do stream processing is Apache Flink. By choosing Flink, they automatically chose real-time processing for the data model criteria. Then, they gave a summary of common asks and trade-offs and one of them is failure recovery. This corresponds with our criteria. One of the
+asks was that the system is fault tolerant. If we follow our model, the next step is to choose the message semantics. In the post, they say that according to the use case losing some events in the pipeline is acceptable while in other cases the event has to absolutely be processed so it requires a better durability. We see that this sentence is a synonym to our message guarantees criteria. In another post [23], they describe this time a real use case: to know what is trending on Netflix. In order to do that, they need real-time data of what users watch; the event is then sent to be processed. They describe that one of their challenges was having a low latency. This last criteria match with ours.
+What we can conclude in this section is that these companies followed a path which correspond with our quality model. All our criteria had been taken into account by these companies and are part of the core decision on choosing and using stream processing framework architecture.
+8 Discussion
+In this section we will discuss the impact of our results, impact on engineers as well as on researchers. This quality model can be used as a guideline when wanting to choose a stream processing framework. Answering what type of criteria is important for a given context will lead to the choice of the right solution; do I need absolutely only one instance of data or is it permissible to have duplicates? (i.e., at least once vs exactly once semantics). Answering these questions based on the criteria we identified will help the engineers make the right choice quicker. Further, the use case of our model is not limited to the choice only. Our model can be extended to serve to design a future stream processing framework architecture. When designing the solution, the model can help to see further steps on what will be implemented and thus the different dependencies it will have: when implementing the fault tolerance, the latency will increase or decrease depending on how it is implemented. Moreover, thanks to the model, we see that the fault tolerance will also influence the message semantics. So based on what we want to have as message guarantees, we will implement the fault tolerance in a different manner. On the other hand, researchers can use this model when wanting to evaluate a framework architecture. Also, this model can be reused in order to compare different frameworks. When wanted, as part of their research, they can have a quicker and a better view on the different solutions and what brings to them and how they are different and also similar. Moreover, when wanted and depending on their need, they can easily extend this quality model in order to adapt it to their work: adding a criterion will add complexity, and thus a possible different path.
+9 Conclusion & Future work
+With the huge amount of data generated, and given a stream processing context, choosing the right framework architecture is major. In order to do that,
+we first identified and explained what are the different criteria such as data model and latency... and presented some stream processing frameworks. We explained our methodology on how we came to choose the ideal framework architecture to fulfill user's needs. Given these, we provided a decision model tree which is a quality model to choose and evaluate a stream processing framework.
+There is more work that has to be done, in order to have more criteria and frameworks, thus to have a more complete and complex model. We can base on this model to evaluate and choose a framework architecture, and not only that, this model can also serve as a guide to designing a new stream processing framework architecture. It can also be used as a support to quickly have a global view of the different solutions and what brings to them depending on the different criteria.
+References
+1. http://storm.apache.org
+2. http://spark.apache.org
+3. A Framework for Real-time Streaming Analytics using Machine Learning Approach, Proceedings of National Conference on Communication and Informatics-2016
+4. http://kafka.apache.org
+5. Michael Stonebraker, Uğur Çetintemel, Stan Zdonik. The 8 requirements of real-time stream processing. ACM SIGMOD Record Homepage archive, Volume 34 Issue 4, December 2005, Pages 42-47.
+6. Supun Kamburugamuve and Geoffrey Fox : Survey of Distributed Stream Processing.
+7. Fangjin Yang, Gian Merlino, Nelson Ray, Xavier Léauté, Himanshu Gupta, Eric Tschetter
+: The RADStack: Open Source Lambda Architecture for Interactive Analytics.
+8. http://samza.apache.org
+9. http://flink.apache.org
+10. Andre Luckow, George Chantzialexiou, Shantenu Jha. Pilot-Streaming: A Stream Pro- cessing Framework for High-Performance Computing
+11. Supun Kamburugamuve, Geoffrey Fox : Survey of Distributed Stream Processing
+12. Michael Stonebraker, Uğur Çetintemel, Stan Zdonik: The 8 Requirements of Real-Time Stream Processing
+13. Karan Patel, Yash Sakaria, Chetashri Bhadane : REAL TIME DATA PROCESSING FRAMEWORKS
+14. Wissem Inoubli, Sabeur Aridhi, Haithem Mezni, Mondher Maddouri, Engelbert Nguifo
+: A Comparative Study on Streaming Frameworks for Big Data
+15. Apache Spark. Apache spark: Lightning-fast cluster computing, 2015
+16. Apache Samza. Linkedins real-time stream processing framework by riccomini 2014
+17. Apache Flink. Scalable batch and stream data processing, 2016
+18. Ankit Toshniwal, Siddarth Taneja, Amit Shukla, Karthik Ramasamy, Jignesh M Patel, Sanjeev Kulkarni, Jason Jackson, Krishna Gade, Maosong Fu, Jake Donham, et al : Storm @Twitter. In proceedings of the 2014 ACM SIGMOD International Conference on Management of Data, Pages 147-156
+19. Matei Zaharia, Mosharaf Chowdhury, Michael J Franklin, Scott Shenker, and Ion Stoica. Spark: Cluster computing with working sets. HotCloud, 10(10-10):95, 2010
+20. Craig Chambers, Ashish Raniwala, Frances Perry, Stephen Adams, Robert R Henry, Robert Bradshaw, and Nathan Weizenbaum. FlumeJava: easy, efficient data-parallel pipelines. In ACM Sigplan Notices, volume 45, pages 363-375. ACM, 2010
+21. Nishant Garg. Apache Kafka. Packt Publishing Ltd, 2013
+22. https://medium.com/netflix-techblog/keystone-real-time-stream-processing-platform-a3ee651812a
+23. https://medium.com/netflix-techblog/whats-trending-on-netflix-f00b4b037f61
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt b/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt
new file mode 100644
index 0000000..ec0514f
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt
@@ -0,0 +1,115 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2020 IEEE 5th International Conference on Cloud Computing and Big Data Analytics
+Big Data Oriented Light-Load Embedded Performance Modeling
+Jinfeng Dou Jiabao Cao
+College of Information Science & Engineering Department of Research and Development Ocean University of China Qingdao 266100, China Nokia Corporation
+e-mail: jinfengdou@ouc.edu.cn Qingdao 266100, China
+e-mail: william.cao@nokia-sbell.com
+Xin Li, Lijuan Wang, Shuya Tang
+College of Information Science & Engineering
+Ocean University of China
+Qingdao 266100, China
+e-mail: 450751328@qq.com, 296189725@qq.com, tangshuya1995@163.com
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract—With increasing development of big data, the performance assessment and optimization face with a big challenge. The traditional methods widely use delivery-testing- analysis-solving (DTAS) ring. In big data area, big data environment is necessary for the testing phase in DTAS, which results in the big cost in both time and hardware. This paper proposes the big data oriented light-load embedded performance modeling. It ascertains the performance criteria to set the Capacity and Performance (C&P) factors. These factors will be embedded into the software with an on-off switch during the architecture, design and developing phases before DTAS phase. After the software coding done with embedded C&P factors, a small traffic load is run to collect the C&P data. The collected data will be used for the performance bottleneck finding, performance optimization, and forecasting the capacity and performance for various customers’ scenarios. Since the data easily help locate the issue, the required running traffic is small, and the problem solving is done before the traditional DTAS, this study is more suitable for the big data application. It can save more than 50% of time, decrease the software development efforts, and reduce the lab resources occupation. Finally, the proposed method is employed in the real prototype of an Internet of Things application, obtains the better capacity and performance, and the experiment data verify its effectiveness.
+Keywords-Big data; capacity and performance; light-load; performance modeling; performance optimization
+I. INTRODUCTION
+With more and more fields applying Big Data and Internet of Things (IOT), the performance assessment and optimization of the software system face with a big challenge [1]. The capacity and performance (C&P) is the base and specific to the software system [2]. Take an example, the closure of issues in GitHub projects and the model of issue closure rates proposed cares about an improved understanding and prediction of the important measure of the development process performance [3]. An abundance of data in many disciplines of science, engineering, national security,
+health care, and business has led to the emerging field of big data analytics (BDA) that run in a cloud computing environment [4].
+Applying traditional performance assessment and optimization, delivery-testing-analysis-solving (DTAS) ring, into the big data application has some problems, such as low efficiency, big testing and debugging effort and complex expensive environment. In the traditional ways, the performance engineering almost depends on the performance tester’s testing and lots of debugging again and again [5]. To process the emerging field of BDA that run in a cloud computing environment, the developers leverage Data- Intensive Scalable Computing (DISC) systems such as Google’s MapReduce, Hadoop, and Spark. While the developers have no easy means to debug DISC applications [6]. It still need lots of testing and debugging day and night with massive test cases for the coverage of big data.
+Various call models are usually used when deploying a software in the customer site. It is composed of some kinds of scenarios with corresponding weights. In some C&P work [7-8], to identify the C&P of one call model, the testing work need be done again and again to find its top capacity and throughput. Moreover, various customers may have various call models. Then the testing work will take lots of lab sessions which mean a lot of human resources, a lot of lab equipment, a lot of power consumption, a lot of lab space occupation, etc.
+To reduce the testing and debugging cost in time and environment for C&P monitor and optimization, some performance testing tools are introduced, e.g., Insure++ for the software by C/C++; Jcontract and Jprofiler for the software by Java; XHProf for the software by php. These kinds of C&P tools can help with debugging. However, it still needs repeated testing and complex expensive environment.
+This study proposes the performance modeling based lightweight embedded C&P method (LECPM). The LECPM embeds C&P factors for the C&P monitor and statistics in the software interior. With a lower load running, e.g. 10% of
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+978-1-7281-6024-5/20/$31.00 ©2020 IEEE 476
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+required traffic, the performance engineer can get the C&P statistics and analysis for the software, find and resolve the bottlenecks and related problems before delivering to integration testing. Since the used load is small, a lot of lab resources can be saved, and repeated testing can be reduced as a lot of lab sessions will be saved. Much earlier the bugs are found, much less the development and maintenance efforts will be.
+II. RELATED WORK
+A performance testing method for embedded software platforms was described, which analyzed the performance constraints of the platform to improve software quality and performance into account during early development stages, test system reliability [9]. The model allowed to take as well as to perform regression testing. The study modeled a system process based on load testing and profiling data to produce representative workloads, create profiler snapshots, and get performance hotspot reports [10]. The performance issues are identified and matched with the specification of antipatterns. A formalism, stochastic performance logic, represented performance requirements, which can identify performance differences in realistic unit test scenarios [11]. An automated approach, PerfLearner [12], extracted execution commands and input parameters from descriptions of performance bug reports, and used them to generate test frames for guiding actual performance test case generation. The study used a declarative domain specific language (DSL) drive the end-to-end process of executing performance tests [13]. A model-driven framework can specify the performance intentions by relying on a powerful target-oriented language. A systematic literature review identified 208 fault prediction studies published from January 2000 to December 2010 [14]. The methodology used to build models seems to be influential to predictive performance. A software model can be analyzed for nonfunctional requirements by extending it with suitable annotations and transforming it into analysis models for the corresponding nonfunctional properties [15]. Communication Sequential Processes (CSP) and the model checker Process Analysis ToolKit (PAT) [16] modeled and verified the OpenFlow scheduled bundle mechanism in software defined networking (SDN), which guaranteed the completeness and consistency of messages transmitted between SDN switches and controllers during the communication process.
+Some study gives the method to resolve part of the performance issues. Most study almost depends on the performance tester’s testing and lots of debugging again and again, and most performance is mainly about fault finding. The testing work will take lots of lab sessions. Various customers may have various call models, so many similar call models need repeated testing, and these testing will take huge of these resources. This paper introduces the performance modeling that helps engineer find C&P related problems before delivering to integration testing, and reduce the development and maintenance efforts.
+III. LIGHT-LOAD EMBEDDED PERFORMANCE MODELING AND CASE STUDY
+We propose LECPM to use low traffic to get the C&P factors composing of the performance engineering base, C&P data. The C&P factors may include the external resources and internal resources, such as CPU, shared memory, message queue, global objects, etc. With these base C&P data, we can compose any call model and give the estimation for each call model for the validation, hence much testing work will be reduced. The C&P data will also clearly show the critical point of the capacity and performance, so the related problems can be much easier found, analyzed and resolved. Moreover, the work in LECPM is done before DTAS, much earlier the bugs are found, much less the development and maintenance efforts will be.
+The performance engineering designates and validates the C&P data, provides the resolutions to optimize the system C&P, and implement the call model engineering with forecasting the system C&P. The LECPM can use the base C&P data but not the personal experience as the chief gauge, which is a much more scientific way. This engineering requires the performance engineer to involve the software development from the beginning of the system requirements analysis. The performance engineer need work with the system engineer to analyze the requirements, work with the architect to be familiar with the software architecture and to give the performance related comments to the architect, need start to write code in the early phase of software framework design and coding, and will start the performance initial analysis after the software framework done and before the functionality implementation. The detail work flow is shown in Fig. 1. It covers embedding C&P factors, C&P statistics and optimization, and C&P forecast. In this section, we will demonstrate how performance modeling is, how is it done, and finally we use the experiment data to verify it.
+Figure 1. The performance modeling work flow
+A. Performance Modeling Base-AASI
+The base of performance modeling is the abundant C&P data. The C&P data is conditionally embedded into the software. The embedding work has 4 steps named AASI in Fig. 2. They are: Ascertain specific C&P factors, Analyze the software architecture and split it module by module and
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+477
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply.
+
+interface by interface, Specify the C&P data, and Implement the embedding of the C&P factors and the statistics of the C&P data in the software. The prior 3 steps are called AAS.
+Figure 2. AASI model
+
+Figure 3. The CPU variation with different traffic
+The C&P factors include the exterior resources and interior resources. The exterior resources are common to all kinds of software; they may be CPU usage, shared memory, network bandwidth occupation, the disk usage, the DB resources, etc. The interior resources are specific to the certain software, may be message queue, some certain global objects, count of threads, etc. The C&P factors may be some of them which depend on the software’s usage scenario and architecture characteristics.
+Here we need study the specific software architecture. Any software can be modularized, and the modules communicates with each other using the public or private interfaces, and some modules may also communicate with external resources or third party applications using public interfaces. These interfaces may be some global objects, some message protocols, the files, the shared memories, DB objects, etc.
+In addition to the C&P factors ascertainment, modularization and interfaces identification, the software application scenarios need to be identified. What we should do is to identify each single scenario. All of them will be used to specify the C&P data. Actually any above C&P factors can be used for the C&P data. The C&P data could be like the CPU time used in one module and/or in one message, it can be counted with average value in a certain time, or be counted with the total value in a certain time. The experiment shows that the average value in a certain time is much more useful and much easier to be compared and to be analyzed. The network bandwidth can also be as the C&P
+data. We can count the messages size in a certain time when they are transferred between the modules or between the module and external network element. They can be shown finally as the network bandwidth statistics. If the message queue is used in the software to have the modules interior communication, the message queue status need be taken as the C&P factor; it can be the size of queue, or be the hold time for the queue. Take one more example, in some software, some global object is used to be the critical shared resources among some modules, then it must be used for the C&P data. The performance engineer may care about its total size any time, or about its variation trend. The final step, the embedding implementation, is to apply the above analysis and design into the deployed software. Definitely it should be a feature of this product, and it also has the common software development cycle. It should be enabled or disabled easily, and it will only be used in the development lab. It will not take effect in the site, and will not and should not have any impact to the software when deployed in site. For the implementation, it is suggested that in the early development phase, i.e., once the software architecture is designed, these C&P data should be embedded into so that it can validate that the software adopts and implement a healthy architecture.
+B. C&P Monitoring and Optimization
+The software C&P is measured with the data of traffic throughput under the certain CPU level. We often set the CPU level as 45% or so for the max normal load in most healthy software especially related to the human behaviors, and before the CPU usage reaches at 40~50%, the CPU usage variation is linear with the traffic, as is verified in the experiment, shown in Fig. 3. The probability of the certain traffic load occurrence is following the Poisson distribution [17]. In probability theory and statistics, the Poisson distribution is a discrete probability distribution that expresses the probability of a number of events occurring in a fixed period of time if these events occur with a known average rate and independently of the time since the last event. For example, suppose there is a telecommunications application, this application is serving people the communications. In the dimension of time, the communications traffic sometime is busy, and sometime is idle, we can say that the traffic occurrence follows the Poisson distribution. What we want to ensure is that the system works with a good criterion (e.g. 99.999% successful rate) when the traffic load is not greater than the most possible traffic load (with the biggest possibility) per the Poisson distribution theory; and may allow more errors when the traffic load is much greater than this value and reaches at its top, which is defined by the product manager or by the customer. For a healthy and economic software, the CPU usage under the above stated traffic load is 40~50% so that it can be tolerant of peak traffic load with enough CPU space.
+With above analysis, we will monitor that how many traffic throughput is supported by the aimed software under 45% CPU usage. And how big is its supported capacity. Here we will get the CPU time, global objects status, and corresponding memory occupation for each typical single scenario, which are the C&P data base. These kinds of data
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+478
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply.
+
+are what we should monitor. In the performance modeling, we can first use 2 or 3 little call load to get the base, and then with these data and the linear variation below 45% CPU usage to evaluate the rough call load under 45% CPU, finally validate it. So the overall testing effort will be much reduced.
+It is recommended to implement the performance modeling in the software early development as shown in above Fig. 1. Thus in the early development phase, the system performance related problems will be found early. How are they found? In above sessions, this paper stated that the CPU time will be counted in each module, all the message queue status will be monitored, and the global objects variation trend will also be tracked. After analyzing these C&P data, we will compare the CPU time and analyze its reasonability by each module. If the module A takes about 2% CPU time, however, the similar module B takes about 20% CPU time, then we can say that there is something wrong in module B. Moreover, if each message handling takes about 1 second in module C, we can say that module C is abnormal since the message handling should only consume the millisecond level. With the tracked global objects variation trend, if it is not flat but increasing, we can judge that there is some memory leak for these global objects. For the message queue, when using a higher call load, the message queue size increases for module B, we can say that module B has little ability to handle its messages; its ability need be improved by either multiple threads or by enhancing processing capacity of the single thread. We can see that this kind of optimization takes less effort than the traditional methods, and can be verified easily. With this method, the capacity issue can be easily found, and the developers can also check if the new code involves capacity issues using the less-effort performance modeling testing.
+In one real case, shown in Fig. 4, we developed a typical web server with database in an IOT application, which serves the end user for the http request including data query and input, and for the http notification of the received IOT data. The performance modeling method is used in this product to find the capacity issues so as to resolve them. This software uses the average processing time and the average awaiting time as the C&P data. As shown in Fig. 5, we can see that the average awaiting time in the module DataProcessingModule is abnormal, and the average processing time in the modules DataProcessingModule and DBWriteModule are abnormal. The average awaiting time value of other modules is 100 or so, however, the DataProcessingModule is greater than 1000. Most of the average processing time is about 300 or so, and DataProcessingModule and DBWriteModule are greater than
+1000. With the software architecture analysis, the abnormal data in DBWriteModule is caused by the database update operation which is reasonable and acceptable. What we should resolve is DataProcessingModule. The awaiting time means that the messages put into this module can’t be handled immediately. The awaiting time is close to the processing time in DataProcessingModule, after analyzing the software architecture, we find that this module is a single thread, the later coming messages must be wait until the
+previous messages completes. So we change this module to be multiple threads to resolve this issue. For the big average processing time in this module, we note that the logic in DataProcessingModule is the memory operation but not disk operation, so the big processing time is unreasonable. After comparing with the initial C& P data without functionality applied, we found that the pure software framework is excellent in this module. With the quick temporary C&P factor added and test, it is found that one system call related to the time is called, which consumes a big CPU time. The final enhance work and the testing results on these enhancement shows that the system is healthy with good C&P data.
+C. Call Model Engineering Based on C&P Forecast
+The call model definition or requirements mainly comes from the customer sites or from the product manager. When the software is deployed in the customer sites, various customers will have various kinds of call models, and even the same customer will have different call models in the different period. The performance engineering based performance modeling provides an easy way for the call model engineering, which avoids doing much test and saves much effort. This call model engineering is to forecast the C&P based on the C&P data of each single scenario together with the software architecture decomposition data, such as the module hit of each single scenario.
+Figure 4. The Web Server software modules and interfaces
+
+Figure 5. The initial C&P data
+
+Figure 6. The C&P forecast and real test result comparing
+
+Figure 7. The module hit of each single scenario
+Let’s continue to use the web server with database in an IOT application as the example. One customer needs the call scenario with 200 tps (transaction per second) of query + 500 tps of IOT data report, and wants to know the hardware requirement. As shown in Fig. 6, we have had the C&P data of each single scenario, query only and IOT data report only.
+With the software architecture decomposition, each single scenario has the module hit data show in Fig. 7. Fig. 7 indicates how many times each module is called per scenario. We estimate the draft CPU usage according to the subtotal of the time of each module as shown in Fig. 6 and the given tps in each single scenario. The estimation method is:
+First get the estimated subtotal in a certain module: The estimated subtotal in a certain module = <average time per query in the module> * <query tps> + <average time per IOT data report in the module> * <IOT data report tps>. By the way, we can also get the draft average time using the equation: average time = <total time in the module> / <call count of the module>.
+Then the estimated CPU usage can be calculated using the method: ((CPU usage by query only + CPU usage by IOT data report only)/2) * (((<total time of query> + <total time of IOT data report>)/2) / <total time of the estimated subtotal>).
+Finally what we estimated by this engineering method is that 100 tps of query + 500 tps of IOT data report need 63% CPU. The official supported top CPU is 45%, so we need deploy 2 instances of the server platform to support the customer. The experiment validated that this engineering method is close to the real testing result.
+IV. CONCLUSIONS
+Generally, the performance modeling proposed a better method of the performance engineering. With this method, the C&P factors were embedded into the software architecture, which helped the performance engineer easily nail down the capacity issue with little temporary debugging
+code since the C&P data gives detail, helped the performance engineer quickly get the C&P data for the specific call models, and could help the developer quickly find whether a new change to the software has a capacity issue. This explicitness is well suited to the big data background. It helps save a lot of development effort and raises the product's competitiveness. Future research will address how to implement a common implant and how to develop a general estimation tool.
+ACKNOWLEDGMENT
+This work was financially supported by the Shandong Natural Science Foundation (ZR201702170341) and Postgraduate Education Quality Improvement Program (HDYJ18008).
+REFERENCES
+[1] Q. Liu, Y. J. Fu, G. Q. Ni, J. M. Mei, “Big Data Management Performance Evaluation in Hadoop Ecosystem”, 2017 3rd International Conference on Big Data Computing and Communications (BIGCOM), Chengdu, China, pp.413-421, 10-11 Aug. 2017.
+[2] B. Boehm, “Improving and Balancing Software Qualities”, 2016 IEEE/ACM 38th IEEE International Conference on Software Engineering Companion, Austin, TX, USA, pp. 890-891, 14-22 May 2016.
+[3] J. Oskar, J. Szymon, W. Adam, P. Kamil, J. Michal, “Surgical teams on GitHub: Modeling performance of GitHub project development processes”, Information and Software Technology, vol. 100, Aug 2018, pp. 32-46.
+[4] F. Xu, H. Zheng, H. Jiang, W. Shao, H. Liu, Z. Zhou, “Cost-effective cloud server provisioning for predictable performance of big data analytics”, IEEE Transactions on Parallel and Distributed Systems, vol. 30, n. 5, pp. 1036-1051, May 1, 2019.
+[5] J. Y. Wang, “An imperfect software debugging model considering irregular fluctuation of fault introduction rate”, Quality Engineering, v 29, n. 3, July 2017, pp. 377-394.
+[6] M. A. Gulzar, “Interactive and Automated Debugging for Big Data Analytics”, 2018 IEEE/ACM 40th International Conference on Software Engineering: Companion, Gothenburg, Sweden, pp. 509- 511, May 27 - June 03, 2018.
+[7] O. Jarczyk, S. Jaroszewicz, A. Wierzbicki, K. Pawlak, M. J. Lorek, “A software quality framework for large-scale mission-critical systems engineering”, Information and Software Technology, vol. 102, October 2018, pp. 100-116.
+[8] R. Riccardo, Z. Lamberto, F. Alberto, A. Ilan, “Big data analytics capabilities and performance: Evidence from a moderated multimediation model”, Technological Forecasting and Social Change, vol. 149, December 2019.
+[9] A. Shen, M. Kuzlu, M. Pipattanasomporn, S. Rahman, L. Chen, “ A performance testing method for embedded software platforms”, 2016 IEEE International Conference on Cyber Technology in Automation, Control, and Intelligent Systems (CYBER), Chengdu, China, pp.135- 140, 19-22 June. 2016.
+[10] C. Trubiani, A. Bran, A. Hoorn, A. Avritzer, H. Knoched, “Exploiting load testing and profiling for Performance Antipattern Detection”, Information and Software Technology, vol. 95, March 2018, pp. 329- 345.
+[11] B. Lubomír, B. Tomáš, H. Vojtěch, K. Jaroslav, M. Lukáš, T. Tomáš,
+T. Petr, “Unit testing performance with Stochastic Performance Logic”, Automated Software Engineering, vol. 24, n. 1, March 2017, pp. 139-187.
+[12] X. Han, T. T. Yu, D. Lo, “Perflearner: Learning from bug reports to understand and generate performance test frames”, ASE 2018 -
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+480
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply.
+
+Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering, Montpellier, France, pp. 17-28, 3-7 September 2018.
+[13] F. Vincenzo, P. Cesare, “A declarative approach for performance tests execution in continuous software development environments”, ICPE 2018 - Proceedings of the 2018 ACM/SPEC International Conference on Performance Engineering, Berlin, Germany, pp. 261- 272, 9-13 April 2018.
+[14] T. Hall, S. Beecham, D. Bowes, D. Gray, S. Counsell, “A systematic literature review on fault prediction performance in software engineering”, IEEE Transactions on Software Engineering, vol. 38, n. 6, pp. 1276-1304, 2012.
+[15] M. Woodside, D. C. Petriu, J. Merseguer, D. B. Petriu, M. Alhaj, “Transformation challenges: from software models to performance models”, Software and systems modeling, vol. 13, n. 4, pp. 1529- 1552, 2014.
+[16] H. W. Wang, H. B. Zhu, L. L. Xiao, W. L. Xie, G. Lu,” Modeling and Verifying OpenFlow Scheduled Bundle Mechanism Using CSP”, 2018 IEEE 42nd Annual Computer Software and Applications Conference (COMPSAC), Tokyo, Japan, pp. 376-381, 23-27 July 2018.
+[17] I. Ruiz-Rube, J. M. Dodero, R. C.Palacios, “A framework for software process deployment and evaluation”, Information and Software Technology, vol. 59, pp. 205-221, 2015.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+481
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt b/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt
new file mode 100644
index 0000000..796daab
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt
@@ -0,0 +1,198 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2019 Developments in eSystems Engineering (DeSE)
+Data Quality Management for Big Data
+Applications
+ Majida yaseen khaleel Prof. Dr. Murtadha M. Hamad
+ Department of Computer Science, University of Anbar Department of Computer Science, University of Anbar
+ Ramadi, Iraq Ramadi, Iraq majdhsyasyns@gmail.com dr.mortadha61@gmail.com
+ Abstract— Currently, as a result of the continuous increase Several Data Warehouses (DWs) were developed in of data, one of the key issues is the development of systems and different fields. Nevertheless, today's DWs face new applications to deal with storage, management and processing scientific problems. Heterogeneous, independent, scalable of big numbers of data. These data are found in unstructured and distributed are the current sources of data. With the ways. Data management with traditional approaches is difficulties involved, the traditional data warehouse faces inappropriate because of the large and complex data sizes. some constraints, summarized with the following sentence: Hadoop is a suitable solution for the continuous increase in non-existence of scalability owing to problems in data sizes. The important characteristics of the Hadoop are processing combined with natural data. Data nature: new distributed processing, high storage space, and easy semi-structured and unstructured data models and formats administration. Hadoop is better known for distributed file
+systems. In this paper, we have proposed techniques and have created the need for modern data warehouses to be algorithms that deal with big data including data collecting, integrated and used, but traditional DW can not.
+data preprocessing, algorithms for data cleaning, A We have proposed a technique for converting Technique for Converting Unstructured Data to Structured unstructured data to structured data using metadata , Data using metadata, distributed data file system
+(fragmentation algorithm) and Quality assurance algorithms distributed data file system (Fragmentation algorithm) and by using the model is the statistical model to evaluate the quality assurance algorithms that decrease above highest educational institutions. We concluded that Metadata limitations and the summation of total query maintenance accelerates query response required and facilitates query cost and response time of the selected views which is execution, metadata will be content for reports, fields and regarded the view selection problem.
+descriptions. Total time access for three complex queries in
+distributed processing it is 00: 03: 00 per second while in non- II . BIG DATA DEFINITION
+distributed processing it is at 00: 15: 77 per second, average is The term big data refers to a huge amount of information approximately five minutes per second. Quality assurance that comes from several sources. Therefore big data do not note values (T-test) is 0.239 and values (T-dis) is 1.96, as a
+result of dealing with scientific sets and humanities sets. In the only refer to this huge volume of data but also the variety comparison law, it can be deduced that if the t-test is smaller of data forms, which are supplied at different speeds [2]. than the t-dis; so there is no difference between the mean of By 2020,there will be around 20-100 billion connected the scientific and humanities samples, the values of C.V for devices leading to more data collection; thus illustrating both scientific is (8.585) and humanities sets is (7.427), using a necessity for applying big data analytics [3]. This takes the law of homogeneity know whether any sets are more forth the requirement of understanding big data. See Fig homogeneous whenever the value of a small C.V was more 1.[4].
+homogeneous however the humanity set is more homogeneity.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Keywords— Big Data, data quality, unstructured Data Distributed data file system, and statistical model.
+I. INTRODUCTION
+Currently, large data volumes appear unprecedented in heterogeneous sources (eg Commercial and educational, finance). The proliferation of smart computers and Internet of things will make them a very technical nature . Strong systems and distributed programs behind the scenario support multiple overlapping systems (for example, smart grid systems [1].
+ Until the big data revolution, traditional technology lacks high storage capacity, keeping all the archiving for a long time and running large data since large data comes from different sources so we need ways to deal with it, big data needs massive data sets to be cleaned, processed, analyzed, secured, and textured. Analysis of data in companies and industries is becoming increasingly important for competing, finding new ideas and personalizing their services. [1]
+
+Fig. 1.volume versus variety
+A. Reasons for Appearance of Big Data
+ Recently, there have been some things that have helped this explosion and increase in size and diversity, including:
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+978-1-7281-3021-7/19/$31.00 ©2019 IEEE 357
+DOI 10.1109/DeSE.2019.00072
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+1. Some regions have very large data for analysis such as meteorology (weather science), genetics (genomics), complex physical simulations, and biological and environmental research [2].
+2.Low storage cost laws that require the continuation of the data in the database to track criminals, vandals and intruders [2].
+3. The advent of Internet technology (IoT), which allows all devices to communicate and interconnect Internet technology and new data production, doors and windows and walls and refrigerators and everything at home connected to the Internet and interact with it [2].
+4. The emergence of social networks (MySpace, Facebook, Twitter and Google) that send large amounts of data over time and various bodies [2].
+III. RELATED WORKS
+1) In 2012, by Abdullah Farhan Mahdi [6] Since On Line Analytical Processing (OLAP) is essential in decision- making He built a model for distributing information to several computers linked to a network using the fragmentation algorithm and conducted a query on these computers, the findings resulted in the velocity of complicated issues being implemented in a lot of relative time [6].
+2) In 2015, Jie Songa, Chaopeng Guoa, Zhi Wanga, YichanZhanga, Ge Yub and Jean-Marc Piersonc [7] this paper presents Hadoop based Olap (HaoLap), an OLAP system for big data. designed an OLAP based on hadoop and applied several algorithms to each particular work to perform roll up operation on dimension hierarchy using the dimension coding and traverse algorithm then stored the dimensions and measurements using the partition and linearization algorithm. Results with efficient performance in OLAP and complex query [7].
+3) In 2017, Xiaolei Li, Zhenyu Tu et al., [8] By using big data analysis to enhance performance and enhance rates, new company opportunities can be acquired. The data analysis was introduced using industrial enterprises and the off-line data reference model library were developed. By using Spark to introduce the web application that is used with the production of Real Time [8].
+4) In 2017 Sonia Ordoñez Salinas and Alba Consuelo Nieto Lemus [9] Opinions differed regarding the warehouse data and large data some concluded the disappearance of the repository data with the existence of large data, while others completed the integration of the two by discovering the points of convergence and difference between them and the work of joint tasks [9].
+5) In 2018, Konstantinos Vassakis, Emmanuel Petrakis and Ioannis Kopanakis [10]. The huge increase in data varies from one generation to another. In the previous generation, the increase of industrial companies, people and advanced technology led to competing companies among them, but now the increase is the result of the Internet and social networking sites that are growing rapidly [10].
+IV. THE PROPOSED SYSTEM
+The proposed system illustrates the main steps from data collection to results obtained using the following algorithms and techniques .
+A. The Role Of Metadata
+ Metadata are an effective task of managing and organizing data while storing it because of the lack of
+effective mechanisms such as metadata. Metadata refers to
+data that describe other data. It adds more organization to
+the data structure, such as the database, and also describes unstructured data such as maps and media Multiplayer [11].
+B. A Technique for Converting Unstructured Data to Structured Data using Metadata approach It is difficult to find a tool for dealing with non-
+structured data that can store and retrieve data that are
+generated in a structured database. The following steps will
+be taken to access non-structured data in the handwriting
+form.
+Algorithm1 for Converting Unstructured Data to Structured Data using Metadata approach
+Inputs: unstructured Data. Outputs: structured Data.
+_____________________________________________ Start
+Step1. Input unstructured data (with various sources). Step 2. Select an affected parameters (features).
+Step3.Using these features to create structured metadata using data modeling (relationships) for this purpose.
+Step4.Apply (Classification or Clustering task) or any mining or statistical methods (machine learning) for an
+efficient accuracy(quality) results
+Step5.Data Visualization. End.
+C. Distributed Processing.
+ The distributed file system is a major challenge in dealing with large data as it uses several computers connected to each other using any available networks and in the case of a specific query will be sent to these computers and respond to rapid response and thus saves time in retrieving data [6].
+1. Data Fragmentation
+ To handle large data, the data are fragmented either horizontally or vertically according to the Fragmentation algorithm to several computers and then dealing with the architecture of Client - Server in the need for a specific
+complex OLAP [6] .
+2. Replication of data
+ Replication is one of the technologies used to copy the data to more than one site to maintain in the case of loss of data from the designated place because it is located in the other and used with the process of fragmentation as integrated work in the architecture of Client -Server therefore, the data are stored more accurately and provide more data and give a detailed report of anything whether homogeneous or not [6].
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+358
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
+
+3) Network Regulation
+ Distributed data operation within the network environment, where possible, should be within the area of building (LAN) or city(MAN). Implementation of the system was based on an internal network (LAN) within organization building. The work will be in the architecture Client -Server [6].
+D. Data Quality
+ Quality is a smart tool for applying sustainable development for all parts of the system at any organization. This is the application of development methods to ensure quality, improvement, sustainability and implementation at high level in practice, operations and performances. [12].
+• General Model of Evaluation
+The statistical models are used to evaluate the highest educational institutions based on standard model. The model is used to evaluate the faculty members in these institutions. The faculty members model is based on five measures and each measure is based on standard ratio with the final evaluation measure obtained from the sum of all the five measures with a rate of 100%. These measures are (Scientific Performance with a rate of 35%, Teaching Efficiency with a rate of 25%, Educational Performance with a rate of 10%, Personal Conduct with a rate of 20%, Foundation Performance with a rate of 10). The performance of the scientific colleges is compared with the performance of the humanism colleges depending on colleges evaluation results with statistical forms using the (T-test) for comparison and the (COV) to know the homogeneousness between the scientific colleges and the humanity colleges[12].
+• The Arithmetic Mean
+ Using (1) and the percentage law we can find the final average to evaluate the university then to the college and then each person in this college [12],
+\bar{X} = \frac{\sum_{i=1}^{n} X_i}{n} \quad (1)
+To compute the arithmetic mean we use (1) Where n is the size of sample
+The arithmetic mean (or average) of the squared deviation (Xi −X)2 is called the variance. The variance denoted
+symbolically by s2 . Its formula is:
+s^2 = \frac{\sum_{i=1}^{n} (X_i - \bar{X})^2}{n - 1} \quad (2)
+Where n is the sample size.
+ The square root of the difference is the standard deviation, as shown in (3). It is used to determine the dispersion of the performance of scientific colleges and the dispersion of the performance of colleges of humanity.
+The (S) symbol refers the square root of standard deviation
+of variable x .[12].
+s = \sqrt{\frac{\sum_{i=1}^{n} (X_i - \bar{X})^2}{n - 1}} \quad (3)
+• Statistical Comparison Functions
+ Statistical comparison has several functions. Here, two comparisons of statistical comparisons were performed on the basis of each of the two components between the performance of comparative scientific colleges and the performance of humanitarian colleges in the following form:
+A. T-test
+ T-test is used to compare between two separate accounts mediums. Its mathematical formulations are illustrated in (4) It depends on the mean and variance of the two sets. Also it brings on a degree of freedom (df) and identify the moral (.), in order to find ( t scheduled ) which can be found from the intersection of (df) with (.)[12],
+t = \frac{(\bar{X}_1 - \bar{X}_2) - (\mu_1 - \mu_2)}{\sqrt{s_p^2 \left(\frac{1}{n_1} + \frac{1}{n_2}\right)}} \quad (4)
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+359
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
+
+where \bar{X}_1 and \bar{X}_2 are the means of samples 1 and 2; n_1 and n_2 are the sizes of samples 1 and 2; s_1^2 and s_2^2 are the variances of samples 1 and 2; and (\mu_1 - \mu_2) is the hypothesized difference between the population means. The pooled sample variance is
+s_p^2 = \frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2}
+with degrees of freedom df = n_1 + n_2 - 2, and the confidence interval for \mu_1 - \mu_2 is
+(\bar{X}_1 - \bar{X}_2) \pm t_{\sigma/2} \sqrt{s_p^2 \left(\frac{1}{n_1} + \frac{1}{n_2}\right)}
+with \sigma = (1 - \text{confidence coefficient}).
+(The sum of all measurements divided by the sample size n — the average or percentage — is called the arithmetic mean. The variance s^2 of (2) is a measurement for the variation of the data in a sample [12]. The deviation is the difference between an individual data value X_i and the mean \bar{X}; it is called the deviation of X_i from \bar{X}, that is, deviation = X_i - \bar{X}.)
+there is a difference between the average of the two samples if the t calculated is greater than the t scheduled. Otherwise, there is not a difference between the average of the two samples if the t calculated is lower than the t scheduled.
+B. The Coefficient of Variation
+ Equation (5) is a statistical function to compare between two different samples based on standard deviation. It is used to find out how distortion data is in the data, where the higher the data indicates that the data is dispersed,
+indicating that the data is more homogeneous and vice versa.
+Fig.3. The original data set.
+ To handle large data, you can fragment it vertically by the following example: "SELECT * FROM item WHERE item_quantity = 209"; see fig.4.
+c.v = \frac{s}{\bar{X}} \times 100 \quad (5)
+V. THE RESULTS AND DISCUSSION
+ In this section , the execution of the proposed algorithms for converting unstructured data to structured data using metadata ,distributed processing(fragmentation), and data quality, which helps decision makers to obtain good results and to make the right decisions .
+A. Metadata of Sales
+ In this section of the proposed system the description of the files (tables) used in data warehouse and details of the reports again the sales system : 1. Metadata for tables that used in sales system. 2.Metadata for complex OLAP query(reports) against sales system. For example Metadata of item Table in table 1.
+TABLE.1. METADATA OF ITEMS TABLE
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+361
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
+
+
+B. Distributed processing
+• Data Fragmentation
+ To handle big data, R are the original data to be split into horizontal data (R1) or vertical data (R2) that contains sufficient data then retrieve the complex queries required from these fragments . It is possible to return the fragments to their original data by collecting them. see fig,3.
+Fig.4. Vertical fragmentation
+ And to handle large data, you can defragment horizontally by the following example "SELECT item_id, item_name, item_code FROM item”; see fig.5.
+
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
+
+Fig.5. horizontal fragmentation
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
+
+ By applying the proposed system algorithms, we found: First: Response Time of Query
+ The query response time in the OLAP and decision support systems is critical and very important. By applying distributed processing algorithms to the sales system, we concluded that when processing large data time saving (i.e. the system requires a few minutes), high quality and data retrieval speed. Therefore, the implementation of the query on the distributed processing provides us with fast response time and speeds up decision making. See fig. 6.
+00:14:24 with out dis.
+processing 00:07:12 distributed
+processing 00:00:00
+total Q3 Q2 Q1
+time
+Fig.6 . Execution time of OLAP query in Distributed processing
+Second : Evaluation of higher education institutions
+ We can apply statistical models to the big data were to be Iraqi universities and evaluated according to the standards mentioned and therefore we applied statistical models at the level of Anbar University as a sample of Iraqi universities . Evaluate and Compare Science with human Section The percentages are illustrated in table 2,3,4.
+ After taking several colleges and applying them a statistical models to five measures. The following results are illustrated in different fig.7 and fig.8.
+
+Fig.7. Rate assessment of final evaluation of the colleges
+
+Fig.8 .Rate assessment of scientific and humanity colleges
+TABLE 2. EVALUATION OF THE SCIENTIFIC SECTION WITH HUMANITIES
+
+TABLE3. A COMPARISON OF TWO SETS TO KNOW DIFFERENCE
+
+TABLE 4. COMPARED TO THE TWO SETS TO KNOW HOMOGENEITY
+
+VI. SYSTEM EVALUATION
+ The design and implementation of proposed system can
+be evaluated as: .
+1. response time: we used the proposed system to process
+large numbers of data and realized that it would take a few
+minutes or seconds to answer the complex queries.
+2. Ease of application: algorithms can be applied using any programing environment.
+3. Accuracy: the accuracy of query optimizing based on the
+selection best set of views and tables that will be used for
+creating new query by applying proposed algorithm for optimizing the query.
+ We compare this thesis results with other results based the following factors in the table 5.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+362
+Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt b/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt
new file mode 100644
index 0000000..bfd345b
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt
@@ -0,0 +1,109 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2019 IEEE 19th International Conference on Software Quality, Reliability and Security Companion (QRS-C)
+Research on Security Detection and Data Analysis for Industrial Internet
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Lin Jun
+China Electronic Product Reliability and Environmental Testing Research Institute,
+ Guangzhou, Guangdong, China, 510610 Email: linjun@ceprei.com
+Abstract— Industrial Internet platform needs to solve a series of problems, such as access of multi-type industrial equipment, multi-source industrial data integration, massive data management and processing, industrial Internet security and so on. This paper builds industrial big data analysis algorithm library based on domain knowledge modeling and big data analysis of industrial data. Through the analysis of the behavior characteristics of industrial internet network traffic data, this paper studies the method of selecting traffic characteristics of events in the industrial Internet; establishes the propagation and evolution model of security events in the industrial Internet, and builds a traceability map of security event propagation; This study combines the characteristics of large data volume and centralized control of future industrial Internet to reduce the complexity of security event detection and analysis. It has reference value for industrial Internet controller to formulate node routing strategy.
+Keywords—Industrial Internet, Future network, Big Data, Security Detection
+I. INTRODUCTION
+Industrial Internet is a name given to the current trend of automation and data exchange in manufacturing technologies. It includes cyber-physical systems, the Internet of things, cloud computing and cognitive computing[1]. It is marked by emerging technology breakthroughs in a number of fields, including robotics, artificial intelligence, nanotechnology, quantum computing, the Internet of Things, the Industrial Internet of Things, fifth-generation wireless technologies (5G), additive manufacturing/3D printing and fully autonomous vehicles.
+The fourth wave of the industrial revolution is expected to see the heavy implementation of several emerging technologies with a high potential of disruptive effects [2-3].
+There are many challenges in implementation of Industry Internet, for example: IT security issues, which are greatly aggravated by the inherent need to open up those previously closed production shops. Industrial Internet need to maintain the integrity of production processes. Industrial Internet need to
+Liu Lan *
+College of Electronic and Information, Guangdong Polytechnic Normal University,
+ Guangzhou, Guangdong, China, 510655 Email: hust_ll@126.com
+avoid any IT snags, as those would cause expensive production outages. And Cloud and data security is a big challenge of Industrial Internet. There are many companies like Symantec, Cisco, and Penta Security have already begun to address the issue of IoT security.
+Industrial Internet is the focus of industrial development, and the control system is at the core of the whole industrial system. After the combination of industrial system and Internet, the system architecture has changed from controls-centered to industrial big data as the core [4]. Changes in the industrial Internet architecture have made information and data security very important. Based on the current situation of global industrial Internet development, this paper analyzes the new demands of industrial Internet development on network, studies the collection and integration of industrial big data, and analyzes the data processing and security problems facing industrial Internet in the future. Through the pilot experiments in automotive electronics, 3C manufacturing and other industries, it provides some reference for the future development of industrial Internet network architecture.
+II. BACKGROUND AND RELATED WORK
+Domestic and foreign researchers attach great importance to the research and application deployment of new technologies and networks, and actively explore the use of IPv6, Internet of things, software-defined network (SDN), 5G and other technologies to build industrial Internet that meets the requirements of high reliability, low delay and wide coverage. Among them, the future network data analysis and security research for the industrial Internet is an important direction that needs attention [5-6].
+The Industrial Internet requires large-scale network infrastructure to provide support, and data-driven network architectures provide possible solutions. For example, in [4], a new network architecture consisting of data plane, control plane, information plane and market plane is proposed, which replaces state complexity with computational complexity. Support data selection through data intelligence, solve
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+978-1-7281-3925-8/19/$31.00 ©2019 IEEE 466
+DOI 10.1109/QRS-C.2019.00089
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+problems that are difficult to optimize in the network through data association analysis, and improve network service quality.
+For the heterogeneity of physical implementation technologies and the massive data in the industrial Internet, it is necessary to provide the ability to detect, receive, transmit, and process large amounts of data. In order to realize data processing between heterogeneous networks, a unified interoperability model is needed. Virtualization technology and SDN technology provide ideas for the unified optimization, control, and deployment of heterogeneous network resources [7].
+Industrial Internet is faced with more complex security issues. We need to combine the industry domain knowledge to study new security protection mechanisms suitable for the development of industrial Internet. For the security protection of industrial Internet, more research and exploration pointed out that the typical cyber-physical-system (CPS) architecture supporting Industry 4.0 can be represented by a layered 5C model [8], they are the connection level, Data to information conversion level, cyber level, cognition level, and configuration level. According to the 5C model, the Industrial Internet needs to support flexible devices and sensor networking, real-time reliable information transmission, and efficient big data storage analysis. For the future network security of industrial Internet, it is mainly divided into five aspects: equipment security, network security, control system security, platform security, and data security. The industrial Internet needs to comprehensively analyze and process the big data traffic of heterogeneous systems from five aspects, realize traceability analysis of abnormal/aggressive behaviors, and timely discover abnormal behaviors and alarms in the network. Take appropriate security measures for each level in the platform.
+III. RESEARCH ON DATA ANALYSIS OF INDUSTRIAL INTERNET
+Based on the industrial Internet network data, this paper combines large data analysis, cloud computing and edge computing to carry out data collaborative analysis of intelligent equipment, forming an overall solution of network manufacturing and industrial Internet, solving the real-time, reliable and safe problems of intelligent manufacturing field network. Research on key technologies such as abnormal product state anomaly detection, trend prediction and fault diagnosis, including heterogeneous multi-source mass industrial big data analysis technology and industrial data security analysis technology. The system framework is shown in Fig 1.
+1. Heterogeneous multi-source industrial big data acquisition technology based on CPS
+To deal with the huge amount of data generated by the heterogeneous industrial Internet equipment, and to analyze and deal with the large amount of network industrial data, these are all problems that need to be considered in the development of industrial Internet. We need to build an industrial monitoring system oriented to the big data environment, analyze and
+coordinate all kinds of heterogeneous and industrial big data, adjust corresponding management and production strategies according to the results, and make the overall industrial network adapt to the dynamic and overall requirements of the big data environment.
+Starting from equipment automation and product intelligence, we put forward a heterogeneous terminal architecture integrating distributed perception and reliable transmission, transformed various intelligent equipment required by production, and established a CPS network system. By building a more accurate and efficient data acquisition system, we can comprehensively collect industrial big data and conduct real-time production monitoring.
+Realizing the intercommunication of numerical control equipment is the core of the intelligent factory. We realize the data collection of distributed network of numerical control equipment, robots, automatic production lines and other digital production equipment through the Internet technology based on IoT, industrial Ethernet, Zigbee, Bluetooth and other network technologies. The data acquisition module supports connecting the equipment of different interfaces (such as RS232, RS422, RS485, RJ45, etc.), different communication protocols (TCP/IP, wireless, etc.), different control systems (such as Fanuc, Siemens, Mitsubishi, Heidenheimer, Mazak, Fagor, Agie and other CNC equipment or PLC equipment control system) into a network, and realizing real-time acquisition of equipment status. For machine tools with network CARDS, we can directly collect the real-time status of the machine, program information, the number of pieces of processing, speed and feed, alarm information and other rich information, and collected into the database for further processing.
+2. Industrial Data Modeling and Big Data Analysis Technology Based on Domain Knowledge
+Spark, Hadoop, Storm and other big data frameworks are widely used in batch and stream processing of massive data. Various machine learning algorithms such as decision tree learning and Bayesian learning, especially artificial intelligence algorithms represented by deep learning and transfer learning, are becoming effective tools for industrial Internet to solve diagnosis, prediction and optimization problems in various fields.
+After data collection, merging and cleaning of industrial Internet data, part of redundancy is removed. However, for the whole industrial Internet system, it can only be called initial data. The core data that really needs to be found can be obtained through correlation analysis based on the entire network topology environment, the time and frequency of events, and so on.
+We use artificial intelligence algorithms such as machine learning to achieve clustering, correlation and predictive analysis of historical data, real-time data, and time series data. We have accumulated some experience in our previous work
+[9].
+In the process of industrial big data processing, we build the industrial big data algorithm library. Through deep
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+467
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply.
+
+knowledge of the physical, chemical principles, processes and manufacturing related to the field, the company meets the high confidence requirements of industrial data.
+Heterogeneous multi-source Industrial Devices IOT ZigBee TCP/IP Bluetooth Wireless PLC
+Raw Data (from different Industrial devices)
+
+
Industrial Data Integration Industrial Data Extraction
+Core Data (Standardized)
+Filter; Aggregation; Correlation; Normalization
+Industrial Data Analysis
+Machine learning\Statistics\ Data Mining
+
+MovingAVG ExpSmooth
Copula, trend analysis.
Inter- related rules
+Domain-Knowledge DB
+Automobile 3 Electronics factory C factory
+Application and Testing
+ Fig. 1. Industrial Internet data and security analysis framework
+The data analysis library uses analytical models suitable for R language and Spark Mlib, such as Copula (commonly used for risk analysis), ExpSmooth (exponential smoothing model, which is a more general predictive model), MovingAVG
+(moving average model, commonly used for product demand growth prediction) and Trend (trend analysis) and so on. In addition, there are early warning prediction and rolling prediction services. Visualization technology is used for multi- dimensional analysis and reasoning interpretation to realize visual display of analysis results. According to different scenarios, different analysis methods can be selected to support general analysis interfaces including SQL and Restful services. We study basic domain knowledge and model libraries, maintain data mining analysis programs and model algorithms, and save models and algorithms for easy recall.
+IV. INDUSTRIAL INTERNET SECURITY MODEL AND ANALYSIS TECHNOLOGY
+In the future network, we use the characteristic data found by the previous research steps to analyze the traffic data in the network nodes and reconstruct the path of network attack. In the process of analyzing the network data packets, the traceability map is constructed according to the relevant path information, and the location of the malicious code is speculated and the attacker is found. At the same time, the spread of network malware on the Internet is a dynamic complex network challenge.
+The development of the industrial Internet puts higher demands on network management and network security. However, the traditional network has high hardware coupling and is difficult to expand. It cannot adapt to the changes of the industrial network topology, and it is difficult to meet the flexible and customized requirements of industrial applications. The core idea of SDN is to decouple the control plane and data plane of the network device, and the control function is completed by the controller that masters the global information of the network. With its simple network architecture and strong compatibility, SDN has not only received the attention of academic circles, but also the support of network equipment manufacturers, and has become the focus of research in the network field.
+The flexible configuration of the SDN controller is the future development direction of the industrial Internet. Due to the separation of SDN network control and forwarding, loopholes caused by various applications are inevitable. Security issues such as malicious code and DDOS attacks are also faced by the future Industrial Internet. We study the malware traffic characterization model in the Industrial Internet. Through the traffic collection and feature analysis of the industrial Internet flow table data, the matching classification algorithm is found to accurately discover various malicious attacks. We also study the sampling scheme of SDN packet attack detection in the industrial Internet environment. These studies provide a good reference for dynamic security protection under the industrial Internet.
+1. Research on dimension reduction method of industrial internet traffic
+In the future industrial Internet, key data monitoring can be performed at each node according to the characteristic difference between different data packets of the network node, and the data packet matching the feature value is given a
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+468
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply.
+
+response, and the transmission path of the corresponding data packet is obtained. Realize network data traceability. Since the future network is based on flow tables, the flow table can be used as a matching rule for data packets. As the flow table design supports various protocols, the matching is more granular, and the feature values are also increased. Previous studies have shown that most classification or clustering algorithms are not suitable for a large number of high- dimensional sample sets, and cannot quickly complete the determination of large-scale unknown malicious code. We believe that feature selection is an effective method for secure data preprocessing. By reducing the dimension of traffic characteristics, the complexity of security association analysis can be reduced. We pay attention to the application of feature selection method in future network switch traffic data. We use Fisher, ReliefF, mRMR, InfoGain, CFS, LVF and other feature selection methods to sort traffic characteristics and perform comprehensive analysis according to different feature selection algorithms. Effective traffic characterization data is used to build the next model.
+2. Research on Optimal Feature Subset and Classification Algorithm Selection of Industrial Internet Security Events
+We study the matching degree of different feature selections on algorithm running time and different feature selection methods and classification algorithms. There are many reasons for abnormal traffic, such as DDOS attacks, witty worms, slow scans, etc., which have different performances in traffic characteristics. This project intends to separate the first 8-12-dimensional feature sequences obtained by Fisher, ReliefF, and InfoGain. Combined with different depth learning algorithms, the accuracy of the classification results is calculated, and the best eigenvalues of different types of security event detection and analysis are found.
+3. Research on the provenance tracking model of security events for the future industrial Internet [10]
+This study establishes the future industrial Internet model, considering the network subnet as a community, the subnet is a static community, and the subnets are dynamic communities. By analyzing the impact of node mobility between communities on the infection and outbreak time of security events on the source and destination subnets in different network models. In the mobile environment, the influence of the spread of malicious code on the evolution of the network is studied. Based on this model, the trace path of the security event is found by constructing the traceability map. In this way, the administrator can analyze each event on the propagation path to provide a theoretical basis for the control strategy of the industrial internet.
+4. Research on Attack Packets Sampling Strategy in Industrial Internet Environment Based on Game Theory
+We design and simulate an Industrial Internet packet sampling strategy, using a zero-sum game, and analyze the security of multiple Industrial Internet topology networks. The Industrial Internet packet sampling problem is modeled as a zero-sum security game, in which both attackers and defenders
+participate, and the importance of each point is quantified into the income value. The incomes of the attackers and defenders are determined according to the income value. Under the knowledge of incomes of attack and defense, we determine the Industrial Internet topology with the highest security performance and security defense strategy.
+V. CONCLUSION
+Based on the design concept of Industrial Internet and future network, this paper uses the efficiency of deep learning algorithm to analyze heterogeneous data processing and security analysis of industrial internet, and realize data propagation model and event detection method in industrial internet.
+We collect industrial data from heterogeneous multi- sources, integrate, clean, and fuse data from data modules and acquisition modules of the Industrial Internet. The project carries out modeling and big data analysis on industrial data based on domain knowledge, and establishes the industrial big data algorithm base. We design professional knowledge acquisition, representation and association methods, in-depth mining domain-related knowledge; By analyzing the traffic characteristics of industrial Internet, the paper studies the selection method of traffic characteristics. Establish the event propagation and evolution model in the future industrial network environment, and build the traceability diagram of security event propagation; In the research process, we proved the effectiveness of the project method through detailed analysis and test application examples, and verified it in automobile electronics and 3C manufacturing industry, so as to accumulate application data for data analysis and network security monitoring under the future industrial Internet architecture.
+Acknowledgements
+This research is supported by the Special Project for Research and Development in Key Areas of Guangdong Province (2019B010121001), Guangdong Provincial Department of Education Innovation Project (2016KTSCX078).
+REFERENCES
+[1] The new industrial revolution[R/OL].[2019-03-7]. https://en.wikipedia.org/wiki/Industrial_Revolution
+[2] Manekar A K , Pradeepini G . Cloud Based Big Data Analytics a Review[C]// International Conference on Computational Intelligence & Communication Networks. IEEE, 2016.
+[3] Lee J , Bagheri B , Kao H A . A Cyber-Physical Systems architecture for Industry 4.0-based manufacturing systems[J]. Manufacturing Letters, 2015, 3:18-23.
+[4] Yin H , Jiang Y , Lin C , et al. Big data: transforming the design philosophy of future internet[J]. IEEE Network, 2014, 28(4):14-19.
+[5] Sarkar S , Chatterjee S , Misra S . Assessment of the Suitability of Fog Computing in the Context of Internet of Things[M]// The clash of cultures :. Heinemann Educational Books, 2015.
+[6] Kreutz D,Ramos F M V,Verissimo P E, et al. Software-Defined Networking: A Comprehensive Survey[J]. Proceedings of the IEEE, 2015, 103(1):14-76.
+[7] Hu F . Network Innovation through OpenFlow and SDN: Principles and Design[J]. Crc Press, 2014.
+[8] Machii W , Kato I , Koike M , et al. Dynamic Zoning Based on Situational Activate for ICS Security[C]// Control Conference. IEEE, 2015.
+[9] Lan L , Jun L . Some Special Issues of Network Security Monitoring on Big Data Environments[C]// IEEE International Conference on Dependable. IEEE, 2014.
+[10] Lan L, Ryan K. L.K, Guangming R et al. Malware Propagation and Prevention Model for Time-Varying Community Networks within Software Defined Networks. Security and Communication Networks [J].
+2017. https://doi.org/10.1155/2017/2910310
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+470
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt b/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt
new file mode 100644
index 0000000..6eb5cc8
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt
@@ -0,0 +1,151 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2020 IEEE International Conference on Software Architecture Companion (ICSA-C)
+A Model-Driven Architectural Design Method for Big Data Analytics Applications
+Camilo Castellanos∗, Boris Perez´ ∗†, Dar´ıo Correal∗ Carlos A. Varela
+∗System Engineering and Computing Department Computer Science Department University of Los Andes, Bogota,´ Colombia Rensselaer Polytechnic Institute, Troy, NY, USA
+Email: cc.castellanos87, br.perez41, dcorreal@uniandes.edu.co Email:cvarela@cs.rpi.edu †Department of Systems
+Francisco de Paula Santander University, Cucuta,´ Colombia
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract—Big data analytics (BDA) applications use machine learning to extract valuable insights from large, fast, and hetero- geneous data sources. The architectural design and evaluation of BDA applications entail new challenges to integrate emerging machine learning algorithms with cutting-edge practices whilst ensuring performance levels even in the presence of large data volume, velocity, and variety (3Vs). This paper presents a design process approach based on the Attribute-Driven Design (ADD) method and Architecture tradeoff analysis method (ATAM) to specify, deploy, and monitor performance metrics in BDA applications supported by domain-specific modeling and DevOps. Our design process starts with the definition of architectural drivers, followed by functional and deployment specification through integrated high-level modeling which enables quality scenarios monitoring. We used two use cases from avionics to evaluate this proposal, and the preliminary results suggest advantages by integrating multiple views, automating deployment and monitoring compared to similar approaches.
+Index Terms—Software architecture, Attribute-Driven Design, ADD, ATAM, Big data analytics deployment, DevOps, Domain- specific model, Quality Scenarios
+I. INTRODUCTION
+Big data analytics (BDA) applications use Machine Learn- ing (ML) algorithms to extract valuable insights from large, fast and heterogeneous data. These BDA applications require complex software design, development, and deployment to deal with big data characteristics: volume, variety, and velocity (3Vs) while maintaining expected performance. BDA develop- ment involves three knowledge domains: business, analytics, and technology. In the business domain, business users define business goals and quality scenarios (QS) to drive analytics projects. In the analytics domain, business goals are translated into specific analytics tasks by data scientists. In the tech- nology domain, architects make decisions in terms of tactics, patterns, and deployment strategies addressing QS. The current design approaches do not address this multi-domain nature and complexity involved in BDA application development which frequently leads to delayed deployments [1]. Due to the lack of methods and tools to enable integration and alignment of multiple domains, BDA development presents a costly
+The authors would like to thank Amazon Web Services educational research for granting us their cloud resources.
+transition between development and production environments (“Deployment Gap” phenomenon [1]).
+ACCORDANT [2] is a Domain-Specific Model (DSM) approach to formally specify, develop, deploy, and monitor BDA solutions bridging the gap between analytics and IT do- mains. This paper proposes an extension of the ACCORDANT Method by including architectural inputs (drivers) and aligning to the Attribute-Driven Design Method [3] (ADD 3.0), and to promote the architecture testability following evaluation meth- ods such as ATAM (Architecture tradeoff analysis method) [4]. The proposed method is a model-driven approach that allows us to design, assess, and deploy integrated BDA applications based on architectural drivers: quality scenarios, constraints, tactics and sensitivity points. This proposal was validated with two use cases from the avionics field by designing functional and deployment models, and assessing performance QS in distributed batch and micro-batch processing contexts. The contributions of this paper are: 1) A DSM method to design and evaluate BDA architectures aligned to drivers thus accelerating iterative development and deployment. 2) Three integrated domain-specific languages (DSLs) to specify architectural inputs, functional and deployment view. 3) The experimentation of this proposal on two avionics use cases using different deployment strategies and QS.
+The rest of this paper is organized as follows. In Section II describes the background. Section III reviews related work. Section IV details our proposal. Section V describes the ex- perimentation. Section VI reports preliminary results. Finally, Section VII summarizes the conclusions and next steps.
+II. BACKGROUND
+A. Software Architecture Design
+An architecture description is composed of architectural views to address different concerns, and these views are built based on the collection of patterns, templates, and conventions called Viewpoints. The architectural design is driven by QS and functional requirements through a systematic design method, such as ADD [3]), and it could be evaluated using methods such as ATAM [4]. ADD comprises 7 steps: 1) Review inputs (purpose, functional requirements, QS, and constraints). 2) In each ADD iteration, a design goal is defined from these
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+978-1-7281-4659-1/20/$31.00 ©2020 IEEE 89
+DOI 10.1109/ICSA-C50368.2020.00026
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+inputs. 3) Choose systems elements to refine. 4) Choose design concepts to satisfy the selected drivers. 5) Instantiate architectural elements and define interfaces. 6) Sketch views and record design decisions. and 7) Analyze current design and review goal achievement and design purpose, and start a new iteration (from step 2), if selected drivers are not satisfied.
+B. Infrastructure as Code and BDA Deployment
+Infrastructure as Code (IaC) arises from the necessity to handle the infrastructure setup, evolution, and monitoring in an automated and replicable way through executable specifications. IaC promotes the reduction of cost, time and risk of IT infrastructure provision by offering languages and tools which allow to specify environments, operative systems, middleware, configuration resources and allocate them automatically. Portability plays a key role to deploy, operate, and evolve BDA applications due to the wide range of BDA technologies. Hence, portable standards appear such as Predictive Model Markup Language (PMML)1. PMML models specify machine learning models and data transformations along with their metadata. The PMML standard is supported by a wide range of data science tools such as R, SAS, IBM SPSS, among others.
+III. RELATED WORK
+Several works have proposed frameworks to build and deploy BDA applications. We review and compare some of the most relevant works in Table I highlighting the important features. In the analytics domain, we compare if they use separation of concerns (SoC), cross-industry application (CI), and support of technology-neutral models (TNM). Regarding software architecture concepts, we include: QS specification (QSS), functional (FV) and deployment (DV) views, tactics (AT), and target-technology assignment (TTA: predefined tech- nologies (P) or extensible code generators (C). Considering DevOps practices, deployment specification (DS) defines if only a number of instances (I) per component or a whole deployment diagram (D) can be described. Finally, practices as continuous deployment (CD), QS monitoring (QSM), and self-adaptation (SA) support IT operations.
+Some works have presented DSM to model analytics func- tions, however, they do not tackle architecture concepts and deployment considerations because they are only focused on functional definitions. Lechevalier et al. [5] introduce a DSM framework for predictive analytics of manufacturing data using artificial neural networks to generate analytics models. Sujeeth et al. present in [8] OptiML, a DSL for machine learning which describes analytics functions using a statistical model that covers a subset of ML algorithms, this analytics functions are analyzed and optimized before the code generation.
+In contrast, we found another group of studies interested in infrastructure concerns of BDA applications leaving aside their functional components. Gribaudo et al. [6] propose a mod- eling framework based on graph-based language to evaluate the system’s performance of running applications that follow
+1http://dmg.org/pmml/v4-3/GeneralStructure.html
+the lambda architecture pattern. Huang et al. [7] introduce a model to design, deploy, and configure Hadoop clusters through architecture metamodel and rules, which describe BDA infrastructure and deploy automation.
+A final group of works combines functional definitions and deployment specifications. QualiMaster [9] focuses on the processing of online data streams for real-time applications such as the risk analysis of financial markets regarding metrics of time behavior and resource utilization. QualiMaster aims to maximize the throughput of a given processing pipeline. Fastscore [10] is a commercial framework to design and de- ploy analytics models. Analytics components are convention- ally developed using a determined programming language or technology-neutral models, and once imported to the platform, they can be connected to data inputs and outputs. SpringXD
+[11] is a unified, distributed, and extensible system for data ingestion, analytics, processing, and export to simplify BDA development and deployment. Finally, the DICE project in
+[12] presents a DSM offering big data design that comprises data, computation, technology-frameworks, and deployment concepts to design and deploy data-intensive applications. DICE proposes a model-driven approach to develop applica- tion models that are automatically transformed into IaC.
+IV. THE ACCORDANT METHOD
+This proposal aims at offering a high-level approach to design BDA solutions starting from architectural artifacts, instead of source code. Specifically, we propose an architecture design and development method based on ACCORDANT [2] framework to deal with architectural drivers, functional, and deployment views. Our proposal comprises a design and deployment method, and its underlying metamodel. This metamodel extends that proposed in [2] by including archi- tectural inputs and serverless deployments. Fig. 1 depicts the ACCORDANT Method steps, which specializes and integrates ADD and ATAM concepts in the BDA domain.
+The steps performed in the ACCORDANT framework are framed in solid lines, while the steps made with external tools are in dotted lines. ACCORDANT is iterative and composed of seven steps: 1) Elicitation of drivers (business goals, QS, and constraints) by business users and architects. 2) The data scientist builds and data transformations and analytics models (exported as PMML files) addressing the business goals. 3) The architect designs the software architecture in terms of functional view(FV) and deployment view(DV). FV makes use of PMML models to specify the analytics components’ behavior. 4) FV and DV models are interweaved to obtain an integrated model. 5) Code generation of software and infrastructure is performed from integrated models. 6) The code generated is executed to provision infrastructure and install the software. 7) QS are monitored in operation, and new design iterations can be made to fulfill the drivers.
+A. Architectural Drivers Elicitation
+According to ADD and ATAM, architecture design and evaluation are driven by predefined quality scenarios (QS)
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+90
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
+
+TABLE I
+RELATED WORK
+
+Work
SoC
Busin
ess(Analytics)
Softw
areArch
itectur
e
De
vOps
CI
TNM
QSS
FV
DV
AT
TTA
DS
CD
QSM SA
Lechevalier et al. [5]
+Gribaudo et al. [6], Huang et al. [7] OptiML [8]
+Qualimaster [9]
+FastScore [10]
+SpringXD [11]
+DICE [12]
C
+C P C
D
+I I D
ACCORDANT
C
D
+Fig. 1. ACCORDANT Method Overview
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+91
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
+
+which must be achieved through design decisions compiled in well-known catalogs of architectural patterns and tactics. QS and tactics are inputs of the architecture design, therefore we include these initial building blocks in the ACCORDANT metamodel along with other concepts like constraints. Fig. 2 details the main input building blocks grouped by a (Project) which contains the elements required to start the architectural design: QS (QScenario), Analyzed QS (AnalyzedQS), SensitivityPoint and Tactic. A QScenario determines a quality attribute requirement for a specific Artifact. Thus, for instance, a QS could be defined as “latency <= 3 seconds” for an artifact (software component or connector). A QS is analyzed through an AnalyzedQS, and sensitivity points. A SensitivityPoint is a decision’s property (a set of elements and their relationships within architectural views) that is critical for achieving the QS, and that such decision is the application of a Tactic to a specific application context. Finally, Constraints restrict architectural decisions, e.g. mandated technologies, vendors, or processing models. This step covers ADD’s steps 1 and 2.
+B. Analytics Model Building
+The data scientist builds and evaluates data transformations and analytics models using data science tools, which are independent of ACCORDANT. This approach decouples analytics models and software architecture supported by the portability given by the PMML format, but also it enables us to offer an integrated multi-domain framework.
+C. Software Architecture Design
+Once drivers are defined in step 1, architecture is designed in the step 3 and expressed on the views instantiating tactics
+
+Fig. 2. Excerpt of Architectural Inputs Metamodel.
+in a concrete application. These decisions are associated via SensitivityPoints, and they will be evaluated against the initial QS to validate whether the architecture is achieving its goal. This step spans from steps 3 to 6 in ADD.
+Functional View allows us to design analytics pipelines in terms of ingestion, preparation, analysis and exporting building blocks. FV specifies functional requirements of the analytics solution, and the constructs are described in a technology-neutral manner. FV is expressed in a component-connector model. Sensitivity points can be associated to components and connectors to represent where architectural decisions have impact regarding the QS. Component metaclasses are specialized in Ingestors, Transformers, Estimators and Sinks. Estimators and Transformers are the software component realizations of
+PMML predictive models and data transformers respectively. A Component exposes required and provided Ports. Connec- tors metaclasses transfer data or control flow among compo- nents through an input or output Roles. A set of connector types are defined: Procedure Call, Event, Stream, Adaptor, Distributor and Arbitrator.
+Deployment Viewpoint includes DevOps practices starting with the specification of how software artifacts are deployed on a set of computation nodes. DV metamodel comprises Pod, ExposedPort, and Deployment metaclasses to operationalize BDA applications. A FV model can be deployed in different DV models either to use a different strategy or to test the fulfillment of predefined QS. DV contains Devices, Services, Deployments, serverless environments (ServerlessEnv), and Artifacts. Sensitivity points can be assigned to Deployments and Artifacts to map critical architectural decisions in the DV. Devices (physical or virtual), Pods, and ExecEnvironment constitute the main elements to provision virtual machines or containers-based infrastructures. On the other hand, the ServerlessEnv element describes a computing environment in which the cloud provider dynamically manages the allocation of machine resources. Finally, Artifacts correspond to executable or deployable representations of functional elements (i.e. components and connectors from FV) which can be deployed on either execution or serverless environments.
+D. Integration, Code Generation, and Execution
+Once PMML, FV and DV models are designed and in- tegrated, code generation takes place using model-to-text transformations. Code generation is twofold: software and infrastructure (IaC) code. On the software side, each com- ponent and connector is assigned to a specific technology regarding their properties and constraints. Such assignment enables us to generate code for target technology restricted to those constraints. The analytics model’s inputs and outputs are transformed to the component’s interfaces (required and provided respectively). To monitor QS, the code generators include specific machinery at application level to measure specific metrics (e.g. response time, throughput, deadline, etc) for each artifact according to its associated QS. This allows us to reduce code for logging starting from high-level quality specifications. On the IaC side, DV model is transformed into Kubernetes’ configuration files, used to create and configure infrastructure over the Kubernetes where software artifacts can be automatically deployed using the FV-DV mappings.
+E. Solution Monitoring
+In the last step, the performance metrics of the BDA application are gathered to be compared to initial QS and evaluate the fulfillment of quality requirements. In this step, the architect has to check the outputs and to make decisions in the architectural views. This process can take several iterations, and this is the whole cycle that we expect to accelerate by using ACCORDANT. This ACCORDANT step corresponds to analyze drivers’ achievement in ADD (step
+7), and to analyze architectural approaches evaluated against each scenario in ATAM.
+V. EXPERIMENTATION WITH AVIONICS USE CASES
+Our experimentation aims to compare development and deployment time for each iteration with other two frameworks reviewed in Section III: FastScore and SpringXD. We chose these frameworks because they are the closest to our approach, and they support portable analytics models.
+We validated our proposal using two use cases: UC1) Near mid-air collision detection, and UC2) Near mid-air collision risk analysis. These use cases are applied to analytics models, they also illustrate BDA facets as streaming and micro-batch to deal with the velocity aspect and batch processing. More details about the use cases can be found in [13], and source code is publicly available2.
+Use case 1 (UC1) was applied in aviation safety to detect near mid-air collisions (NMAC) on different air space ranges with different deployment models while performance QS is monitored. NMAC detection comprises a pairwise compar- ison of flights to calculate location, speeds and heading to determine the risk level of NMAC. Eight-hours of data were stored in a distributed file system to be loaded by JSON reader component. This ingestor calls NMAC detector which computes the alert level. Once an alerting level is calculated for each flight pair, the results are sent to the clustering estimator to be associated with a specific cluster, and these results are stored back in the file system. This use case requires a heavy workload nature, and therefore a performance QS for deadlines lower than one hour was defined.
+Use case 2 (UC2) is a real-time application to detect NMAC within an air space range. The ingestor component consumed data through direct REST service. Flight data was pushed in a message queue to be consumed by the NMAC detector component which performed the potential collision detection to be finally stored in a relational DB through a message broker connector. It is worth mentioning that the NMAC estimator of UC1 and UC2 are the same, since its inputs, outputs, and behavior are identical, so we can reuse such functional component definition, though their deployments are different regarding the QS constraints. Given the near real-time nature of this application, latency is the critical QS.
+A. Architectural Drivers Elicitation
+The business goal is to group NMAC events to identify potential risky zones and times within specific air-spaces. A scheduled job to detect risky clusters is processed in batch every day. Fig 3 details drivers expressed using the ACCORDANT’s DSL. The NMACDetector component is required to have a deadline lower than 1 hour in the QS UC1 QS1. Analyzing this QS, a sensitivity point (UC1 SP1) is identified to achieve the deadline metric by applying two tactics: introduce concurrency and increase available resources. These tactics will be materialized in the software architecture design.
+2http://github.com/kmilo-castellanos/accordant-usecases
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+92
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
+
+
+Fig. 3. Excerpt of Input Package Models of UC1 Using ACCORDANT DSLs
+
+Fig. 4. Excerpt of Functional Models of UC1 Using ACCORDANT DSL
+B. Data Transformations and Analytics Models
+Analytics models were trained and evaluated by the data scientist using Scikit-learn, exported to PMML, and loaded in the ACCORDANT FV model. In this case, the decision tree and K-means models will be assigned in the FV specification.
+C. Design of Software Architecture
+FV models were designed using ACCORDANT Functional DSL to specify a component-connector structure for each use case, Fig. 4 depicts the UC1’s FV model. Since drivers are required in FV, this package is imported using the keyword use. The FV model specified four components (JsonReader, NMACDetector, NMACClustering, and HDFSWriter), and three procedure call connectors: CallNMACDetector, CallClustering, and CallWriter which connect the components through ports. Additionally, NMACDetector uses batch processing model, and it has associated “NMACTreeModel.pmml” obtained in the previous step. The sensitivity point UC1 SP1 aligns the drivers to the NMACDetector as part of the introduce concurrency tactic realization. NMACDetector will be translated into a distributed processing component which must be supported by the target technology.
+DV models were designed using ACCORDANT DSL for UC1 defined in the FV, see Fig. 5. Given that DV is based
+
+Fig. 5. Excerpt of Deployment Models of UC1 Using ACCORDANT DSL
+on the input package and FV model, they are imported using the keyword use. This view includes the artifacts that map connectors and components from FV to deployable elements in DV. For instance, NMACDetector (see markers A) is mapped to NMACArtifact, and deployed in SparkWEnv (see markers B). Devices and deployments were specified to support the computation requirements. For instance, deployments of Spark master and worker nodes (e.g. SparkWorkerDep) details replicas, pods and execution environments (ExecEnv). ExecEnv defines the docker image, resources, and ports along with the artifacts to be deployed. Finally, the sensitivity point UC1 SP1 associates the deployment SparkWorkerDep to performance QS, and the tactic increase available resources (see Section V-A) to support distributed computing over a Spark cluster.
+D. Integration, Code Generation, and Execution
+Once FV and DV models were designed and integrated, code generators produced functional code and IaC. The target technology selected was Apache Spark, so NMACDetector component implements the PMML model in a Spark driver program. The Spark program defines data input and output from the Data Dictionary and Mining Schema embedded in PMML specifications. On the other hand, the infrastructure code was generated as Kubernetes’ configuration files. Kubernetes code was executed on the AWS cloud using Amazon Kubernetes and EC2 services. After that, the software code was installed over the cluster to operationalize the solution.
+E. Solution Monitoring
+Deadline and latency metrics for each use case were collected in operation and validated against QS defined in Section V-A. As a result, different deployment configurations were designed, deployed and monitored in each iteration to monitor the fulfillment of QS.
+VI. PRELIMINARY RESULTS
+Revisiting the related work reviewed in Section III, we have shown how the ACCORDANT Method fills some gaps
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+93
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
+
+
+Fig. 6. Development and Deployment Time for Use Case
+in BDA architecture. As presented in Fig. I, ACCORDANT follows the SoC principle using three different languages to specify domain concerns. Analytics models in ACCORDANT are cross-industry and technology-neutral. In terms of software architecture, ACCORDANT supports QS specifications aligned to FV and DV, and these models can be specified independently, but in an integrated way. Code generators promote flexibility and faster development and deployment. Respecting DevOps practice, deployment models allow us to design deployment diagrams and generate IaC to provision such resources semi-automatically. The solution monitoring is aligned to the initial QS specification and implemented by injecting logging code in the generated applications. Finally, self-adaptation is not covered in the current version.
+Regarding the development and deployment effort, Fig. 6 depicts the average times invested for UC and two development teams. These teams developed the UCs using each framework and taking drivers (QS, constraints, and tactics) and the PMML model as input. Each UC was deployed to cloud containers, and the QS monitored using the features offered by each framework. The development time using ACCORDANT was higher (between 22.7% and 44.4%) compared to SpringXD and Fastscore, but the deployment time was significantly lower (between 50% and 81.8%) using ACCORDANT. The higher development time can be explained by the time required to specify architectural inputs and FV models. Besides, the current ACCORDANT prototype generates functional code for estimators, but ingestor, sinks, and connectors still require manual coding. Although ACCORDANT required more effort in the development phase, this effort was rewarded during the deployment phase, where infrastructure and QS-monitoring are provided automatically aligned to QS, unlike other approaches. The biggest time differences arose from UC1 that demanded more time because it included a more complex pipeline, involving two estimators. These results suggest ACCORDANT is more suitable for application involving multiple iterations, or in subsequent applications where reusing architectural elements can reduce development times.
+VII. CONCLUSIONS
+We have presented a design method to specify, deploy, and monitor BDA solutions. Two avionics use cases were used to evaluate our approach against two BDA frameworks. As a result, ACCORDANT has shown to facilitate and accelerate iterative deployment by offering an integrated and high-level design of BDA applications by investing more effort in the design phase. In contrast, some limitations have emerged from
+experimentation. The development phase is slower than the other approaches for multiple reasons. The current version of the ACCORDANT’s prototype requires extra manual coding. ACCORDANT also requires more design details and architectural inputs. These additional definitions are rewarded in consecutive iterations, so ACCORDANT is most suitable for application involving multiple iterations. Finally, our approach takes advantage of reusing architectural decisions and models, hence, first-time or one-time applications may not be benefited from our proposal.
+The next steps include a model to predict the expected performance based on FV and DV models, target technologies, and collected metrics to recommend the optimal architecture configuration given a set of drivers. Furthermore, we are developing validation rules to check correctness properties against architectural constraints, e.g. technology conformance, resource availability, and architectural mismatch, taking advantage of the integration among drivers, FV and DV. Finally, the experimentation has been performed using containers in the DV, but we expect to include serverless and/or fog computing deployment which can open new challenges.
+REFERENCES
+[1] H.-M. Chen, R. Schütz, R. Kazman, and F. Matthes, “How Lufthansa Capitalized on Big Data for Business Model Renovation,” MIS Quarterly Executive, vol. 1615, no. 14, pp. 299–320, 2017.
+[2] C. Castellanos, D. Correal, and J.-D. Rodriguez, “Executing Architectural Models for Big Data Analytics,” in Software Architecture, C. E. Cuesta, D. Garlan, and J. Pérez, Eds. Cham: Springer International Publishing, 2018, pp. 364–371.
+[3] H. Cervantes and R. Kazman, Designing software architectures: a practical approach. Addison-Wesley Professional, 2016.
+[4] P. Clements, R. Kazman, M. Klein et al., Evaluating software architec- tures. Tsinghua University Press Beijing, 2003.
+[5] D. Lechevalier, R. Ak, Y. T. Lee, S. Hudak, and S. Foufou, “A Neural Network Meta-Model and its Application for Manufacturing,” in 2015 IEEE International Conference on Big Data, 2015, pp. 1428–1435.
+[6] M. Gribaudo, M. Iacono, and M. Kiran, “A Performance Modeling Framework for Lambda Architecture Based Applications,” Future Generation Computer Systems, jul 2017.
+[7] Y. Huang, X. Lan, X. Chen, and W. Guo, “Towards Model Based Approach to Hadoop Deployment and Configuration,” in 12th WISA. IEEE, sep 2015, pp. 79–84.
+[8] A. K. Sujeeth, H. Lee, K. J. Brown, H. Chafi, M. Wu, A. R. Atreya,
+K. Olukotun, T. Rompf, and M. Odersky, “OptiML: An Implicitly Parallel Domain-Specific Language for Machine Learning,” in 28th ICML, 2011, pp. 609–616.
+[9] M. Alrifai, H. Eichelberger, C. Qui, R. Sizonenko, S. Burkhard, and
+K. Chrysos, “Quality-aware Processing Pipeline Modeling,” QualiMaster Project, Tech. Rep., 2014.
+[10] Open Data Group, “FastScore.” [Online]. Available: https://www.opendatagroup.com/fastscore
+[11] S. Anandan, M. Bogoevici, G. Renfro, I. Gopinathan, and P. Peralta, “Spring XD: a modular distributed stream and batch processing system,” in Proceedings of the 9th ACM International Conference on Distributed Event-Based Systems - DEBS ’15. New York, New York, USA: ACM Press, 2015, pp. 217–225.
+[12] M. Artac, T. Borovsak, E. Di Nitto, M. Guerriero, D. Perez-Palacin, and D. A. Tamburri, “Infrastructure-as-Code for Data-Intensive Architectures: A Model-Driven Development Approach,” in 2018 IEEE International Conference on Software Architecture (ICSA). IEEE, apr 2018, pp. 156–165.
+[13] C. Castellanos, B. Pérez, C. A. Varela, M. d. P. Villamil, and D. Correal, “A survey on big data analytics solutions deployment,” in Software Architecture, T. Bures, L. Duchien, and P. Inverardi, Eds. Cham: Springer International Publishing, 2019, pp. 195–210.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+94
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt b/docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f032de85112ede2fd74a78e0b635b9b063ad95be
GIT binary patch
literal 25197
zcmeI4U2YuNmFMS~0KJBb!ZfA?kXaI?`qGP=A}NWsB+6t;Ww+pmU}lgED>EaVkx7bv
zsy$4#;
zCe^fB+U?C=xUW}X#2Txn49^#38HVL_UN4HpeRx%k${Ck$mdknb^ouX%i+a2oEt^5%
zi;wE*7kAp=i)DIdf8x)4vAZ+8(SpUaEzRLQH++=>^g+qop-ZlxnQ-`f`hzk
zit935)r;_~x(>(1vIuV{#gYqCZr$BEX%Bcdo7@jVj|2U)9>IY{b$zo82M-@U-amMF
z@Fa30yj^mr<#?bahpXB3{-j(qllnd!-4xAgUa;3uJ(-ly0=?C2R^P%VxHc?DH?w+D
zU*E&-aX6V>SF^HQ=qSfGt3~m#`crtjnvJVvcsJvLMN=*BcXyr@v+Ixb$GtGTDXIxi
z?(R&BW?3$l@$dh5Q!l6dG=P=6JBOFB1P=Z1{Xe&SxGedtY{Cd;RmCJM>w3}{t7he0
zC>MoD9gAbdteD&{tC8dl?~7@@nl1I~s>^D-wQ5j=rsjwvRH@+{19%id5zv!nHCgV3
zuw;!eTTL&^MbIk!WqiwkO8lurC3tjErsc}V>bprfMgrlczJrQ7gmDSWrf@tG)jLllq?-AU-91cJ<)`_iMlRLT
z*s>^5s&daxCsyiYWviIIr4y?V^;l;b-oQ8bQx=O6c2`Xskp~N8YPRB4)^{Vk8ke(W
zb#=d$?v?30JliB3_DO=Fvm6Qy#c24sp8CVp3=Z8^CeL{z
zS>%&2<+4QhkgQ#8xyUP?%#4cn34b_J;zmzbI3rqcQ%>fIk&SdRip{TW3eNe|tvh=k0=;9lfqpA^#F3O1kEI)289CdyGTKu-WHvkMW*>_zt6Nc^eFC}Q(
z%f`Le#l+s@>D~;3Yinuwa8%50k=^djDMxxUJU=^rcQiabd$YUqroJo2#Xazz7fLuH
zZb0*7{KNPE(u8TU3d=8N^DHZ^P16(7O
zH%q<5@m-=Bw}NGD_T~HLX$bG@$!dzlzb_~C2v`P@-WQ9i)SuqtH|-~2m!HCiM}IWi
z!_Ff$BlfDUDj5bV!h_kUnorC>B$CXYW=W|wbUvZTNN_A`Y1+eJFsq3UDZgBgOy{uW
z+BU;6%)*z+elPF>(YmK-?7bOR;yjjP+i3A{R|-+@ZYn@3a1w+pY?fDF7I*}?S--vG
zF-`%BG5!nfcYN58l$MJALfojslZOxgfp}v&$NufYnj#<%obQ+6!vRb+?3|Xx3_XL9
zMy+X2JuCOGsw)DSJ9z?Ffqe;P`d(yO%1aNnQu=rKMg|x63<1;NDFfZA)b1V*Z>T$uJnw7d&!8T7S76V|5&|s|S
zxfDzGwJ2|h!>da!P3v(vF^IxW+#0s1mg#SUMBJ@41mWTK|HZR`2E_F`dF|FF4%jl|
zYJp3TkBoX2y`i(vAgjJqNJxlc&}_EQ22CQV&aQhqW8HHLrY^|_OgWQ%8c)V0N}fV=~DR#
zV=QMQ^BeEhqm$(6`)oM%__2G@Y#{|(TP5bgH$la4J7P#3#)?&YtWKWQyizenf}_&y
zvJ(qi!dsP8+Y4^brGkZKGO@kV8}Y|6=%>=CYB)gu^(vPQsI
z-p^fi?beL30%(*YM=7^J!x$!_&X0pVGYF3!gp1^pJ&k>enqDCzX}zsyi?D0tuq`2%
zJT-$VF!H@Xz;1xhigz!1_4cWVa+0$}aRg%c13Y~U%4c4%=s
zP=R9&WWNdz#4jjS9LOcb4+ZK
zLv$Cpivldkj=Rz5*g3>royWu>*7vBiQAnjH#d;Ht;wt%l1#wx9#t8U;mN>=2!_`gPScv}MZKDv3u<~iaEgumM4|LM
z6KB8g$vd@CrmY2i(i(I=dh{M@+5#&t4=HJHNkVZRKj^14E(EmMxM!*@7Fu8j50%Ul
zXyMuF*;3JLHbAaLm9uLtFn4&8Z)SZ7#o}7We&jF3q+AgNsz1am1TvpX!OBmCyeN9f
z`R8+KYP*fVPo=Q2pkrz3rj^_=I2(!97gU;tDIOfDGBol$){bk{w{dlac}rbgtc}|Z
z)KybcwBNseW%|AZK^4tRx)vyAN!fIgb|rLjiNyg&I;4(GwxuEpgz-38ozd=0k9JD*
zHa27IoYD4e3LfFOS?j_SJ=UfX(xW=W<9VF}C*1=1~2GA&p8Hj9Cr(2rKGF3xbqIg_yNz3}Afo
zPQEJ=J>A_PsdJ!-wJj{2S@sqi}k1a>Dk9D>9jdu{zd-9dfMr4F8w))9~)i>HCxOi__!7x!n@&-SHf)L
z-&5^dNCLQ@<)X4s6=a^zRW;t-`CIA#_TewWKx5DW3Uw-%Ix{bTA|KT=0}(}Pol?hP
zD(IS`yj6Zhb-f9npFTT2eEVQ`=YM<>JS81Bt+EUrzi8JpCZ=0JQuk44>h4a$QlNp5
z18+}+5Yr9-Xi-tQfs=SR9&_tQ&lhC6(m?QHHJusi8AwJK0Bi_bZMk?$chji)DiA-T
zF65SLXqtS4o2LKLb4+$ubOnah3chRsLK7MqiZcarYt(p+i}MP@GRWys&r>XMn9329
z)bDW;NWiv=aZd8B2IoMp@Z3|8jU_LM8te4;xlD4h^DwC
zKxpNpou#KqvVUF`6YJG*xQO>+KGhErpWYA1TtKtC(=HYtP>oN>6rC_Cu=#Yy?RK|J
zJSTFTjg>DC!gq=t3mBo@HU%V)mb`rtCaMu$X}^35ThfbD2-7nb$<2xFRHU9uE0YhW
zn%!6!8_$$#J_e^ZI0JGfiX017;yGZBUGGYDg7zpL`+r&Ua#Gj85Ac8&QLh6HfoPmV
z)(%&pP;*2AiLUQ#SLRO(A~V;Sa&}uS>Y0bF=F}CiXic6Tsk~zjr>pNBKzGkY322U!
z=m%^>bC=3>E!Ggn<5@d2iC8$v|Is9NtSA+z`(>Wzyi
zR^wVzlX3AZ^4ykIpb$b26ayY_1&Q1d+SD4;1e63djr%E_p5e`{mf#t~n|lA^cdy(G
zl*xc0r`*z(Eq_EMt*&@YeB>KMDF=K=5r_te5R;9^pM(M(1`81yO2+{8v>fBZ#n4uK
z9dpGlEZ0t~k2bl+Sq{sMl~-4Gcl>4#+%0)Pj2j&4*ynb;^Zeu5DoEH;ImaN7RHDQa
z9(J~@TcKt(LiMFG@g#85^a8UZJMlQ`*mJC*d`x?YXoA|2ht)DE&7D~CV-U>qC^Ns9
zM6FFFs=a`u;LfpiJ*xewleN*Y)KIcZfa`^{y#BJDS0l;9sa{Q|s;*gBsC!b>Yq4^V
z&GA@**Y>Pt3c1WcAHoh3!`=gJlY`)cPRu68h1;edH`+xy&AmD)^`$}owp`sH+%?yW
zmddLlO{2!LEE6~Ri;lEjdgaEn25mo%1BY^sst2wpGb)JZ#&_#gp#i7B@-1&n=PoMH
z%eJMfkZPX)3+Z2^_U8s7^A(ZMlJOBxsOT=23#|e<0}BLCtb`BBKiN^1#U~x1
ze~T_UG>iT-cxMh_e{Q(@>e0)!D1RSwb;&k-O;qIDB|aV>09S
z;~$avCSo6xnwmtz#i+9Sx_g_oi>|>6GJRkKRZSpf5@d=>^(2JUtUSztQ@QOjW?oC+
zY}$^zJ9QnosCu=WgLpZR&iR>i1OK?FKBd%6S6l3nAo-kA$AL!y
z(gmki%@W7*Zt>^?w^GyBTUzBBB#4_u6I4j3830!4Gsshj{>=(w$Mk4&X?=MouEE1S
zW%SO-LvQ2wELM9toEcBHhlRF?hXMKVMH#i1j{K^cFsl&f%rM#EO=;DOz3>~#WsJjk
zPLOHvy=)jl%`g*egDZnQM(L#l643z-)P&S>@8XMxopvDKIys|Z33slt6p65WPQQRgXejO2U0Pl
zQ}wLQPcEq59K8(B&ktXpe0O&KTQ`n1GLbVv?N3q_W6E<1`wi)>AW{E_QTpU#&Y`;x
zY+=C`iedCcY-ApU&yQ(eXEm
z*DQ
zexHx@YDjWa?WQ`oN@rrBZyi^9@3WA$T|#+WK8#LMC2L8ONEQ)MA!!F7l%kMw`YXuHSzQk&**jKB44BQJGE-^M1f*MZX7Ln0y?Nwl49N?=$A
zA<3@xGGkCV3liz+6Gm}gc1E%s_(}c
zh76}R`X>SqvJ!5cGl%_g>jo=ZfT~mWr!T@ktu8@WZ
zHrJMdU9Hr@>=w@y@2OOpD^guQ?AWKZb=#a&iY2W2WBI$K=0*rqUyE?+u~gs*AW+FT
z?5ND5Ti~0ijjGeio}HN=Sd`br!nz?+2(>v(3RZ{O-Lc^7ahJ)d%F%=mUt3O9
z6Ymh@T7Y_}HkJy}XaGu=CYH4GRawd~KxK`xYa`h~9fsQSXi_u{v$tLj3NJJVp@(vI
zYNZAbwbq{mGXZXHW?S*WFkOEK6znKEgP`-W)Kvh?M^0TAsMzayv*=%KaA(hFW;8fp
z@xIoN3*_GVs3wH0Xz(end(OdgCqs8*+ahFRWp+1em7q)eT*E-AKd+cbQw=Px8t3nh
zV4mG5&S8J~?=fr7Q^3}ep!fuMIl8@8&OS-q1aV}KFrW0#Y(shkPC$hJb&S43G9w6b
z!l~F0ULKxmQw?w-S%wyF9GD^4%1+QL9pavIh7vG0qsYVM2AHMmc
zH83MrC85j%n^)?T2{KRI0j*83CLiFl7+wwc5e}-vRs{94n)oG?6dK%V#(2U6;POT)
zc{m~oO$L#?hMfDbxMvzMlq#`G`OVGJ0F?IIfT<>fb>3($87sAK_AQ7-#$lS7bjdik
zmW3l`Dph9Z)C2TKsZ!LM4!6wXdnB1Wepb~SKqNP8wqXf?jQvLMfSKphIQq4PjPx;>
zYzrXZte)C*uTgJ(O{A;&tWM>?$V)FzX`Ea)GdV=Sgi$fJx-by8vAuLR%q%mzHkl<-B!tNU~A`u*_zUx)1*TgH7jr@DKrpSEMSFwdy5kybYjy>5i!
zlf@4@wO84#22MWCD`s|4ZRUeSpI5_@slH~anQ}m0B3(DikB)}#8ECX&@rX;vOs~|b
zRy&f{PWOL){`~a#K`znolHLONY+s-H^$*|wD{9FQO^$-R$lzAyu>@5LK?y~%?Qji$y|`91Rn(@UTg*tQPaDea4iShD3_V_dKTe|)yWrH<&A3F(OAQP
zu38QewinYg-yp54qo`izAX*?FBee_wO@DLb!?)%WwWt{udQMLVkMHx|(k%2~J-e5N
zs1um{rLnx*XoiIU8ndMEvYpEnjS3A5d(KZ(uGfh^fA;$5LGqjSeo$ntJ!Wm7IC-ux
zevQ6V&~7x#nKGmwVU$TOy%b_=|tyX(ll|XZ7=BTt@OEEXlWAq-KxNVsgH{=hl6Yds`=-bmDDXAwkQ4xT1^5w#LYKD_DvpmFW?~R(hX6X}GS(XMrQ;dY2kHIMokO
zmlVPBLU}kZjYXRS1m=KT4$17DAF20Br>DV?41{Io(wbNT@^rtx)abI_WSqegHT?vi
zI&8!>5^?i3`VUeNr9Bu_)}LtR9{x6|K58M3hx$hl5H9FO!&Le{>DBnl)JRpeRtMKP
z%JHv3hhN~m_`lQo?7`1opCto6WK7ZL%cOaV(-DZ`>8*KLmV*UPo|A%QXrS~!p*X|E
zF@Z5-mU-Xl^&~B8>;5;A*v~GnTo?Q*@X~cMJ$V{NfQesLKILxKmjT&56?MV))|Yzs
zYItESme!RXt3;t!zHyjX^bycP)}~<8oh*o8Wlpb8VA-41L=R0;jaT-IrFz(CNmbiS
z%HRk*_?v6!-KHCnHlDPfB-qeKb2Dh6ziMKB#4*Q7jTD$0bA-{aIlJ--csGKULm*7@
zi3J}b78;8mIe0gG);gxhhSa_2Jf^O!l;R@5Ozmwa8BvvvQe-|N059N=EE
zP-27IYM*_(j8piBo=s3?7rh(2Qtibe8M!bQjYS*K6#J}Sr^+jNism46HXT%m9aVvw
zZJxQ{eY*7X7O}=b1Z6tKN#Db%~pc5#eT!UQf-h~!75|um4-USi{AG2d#K^c(N}9~4T8{^=+lMByf-^}=^#RrN8j=O=
zILfk^T7Xh7+XHk?4^MG*z)0d~v|`XDg$Gy@
zDrCE_or6O=)7mjwh^GY&kGmpvU-!C-D;Ye9WNZ{dV{~&!qiDk2pl|qltgx!ML_k
zIm2?esyTCWlGLxrE%XTi&SHbYE%DM&)w+2+$0d>3`Vx>m4Lb^e;;CZols5Pn=SbI+
zjN(#sg;MR0gkO?~>rL!?415nDd%xT7U?4!fZm*TOSJ54md~qOV?NUHSK_MW4YiXs`Drbq
zNi$-!+pM0kex4YwQ9fWEQc6A}{ieA_Yo<@6X}xsqH0vJc5?tEnOzE>IyL-6p;RO##R60VcX|_|EZW$gg6Txy5Vvgf~dP0gi1#Mxf
z>Q=0!tK{9hXrg8PplRJvF;~shEGawWulo>|`*a-__S)uFtXy@PLRjB0GOl&VKYahc
z!-aom1}34N9L-%Zw7P>6w~fr!cXkw^{q-k{BOFd`5pzyDa?eA#yYZ1K-4R{IHz_5y
z>NZp~_0{r@P9VW!$6EAifsBLfSFafKa6Fx%6K!huo-a)bE*0a)$`ZAYYE$u7v^gt%
zBFEVhS($%L{?4Qb2!TNa9qBt^Bwf6~zbTERvNH
z2u(fiX$wiJej+rIkNg5Lh!WqmRZJ2;1C^#Z40o(>n-O*mdQ1ZpX2`N|J0c_Bi&g=$
z=`&ZZW$x(O7Oncb3W?>npdexlX2YxtwjHA)4f!P;jfu5f7y5Kr%G;PRaI9Xu#5-*2
z4$D$9?pWwe*kkXk#joq+mvExr=)tHAIEQK;VZC+V8IfDR^N%}5j-K4Aw)OYsG@oz4
z(=!YUiaFzR?n|j=bo}lBlmAR_XvaEwoXy_?$ciL+biQX{C*bw%!3`&?o+gYjV+ucwJwNB^ztSd5uvrf3^dYINpFi}G0G3GG;c07
zRUswCKb_)B?heoy^OUV;)nsD=9)dLn!GvMCKjU}fR+FREui0&SJ+w!a-3N!w!BHml
z?=k>s+?7KWcgAVt!y2$X2=<^3BbZMHdxsr
zZgA#|mTI)?wI?l}HI(Kg9Q-`Ll*SXrPKC#4HHf(6T2#F%HC{VziqHOZi3Td7SML-(nfp;0er7R
zTiJ_N3Sn&_&jUBslRP(0$TgY@^cJAm3+HxEx5=JMX@2sT={upkEv%1#!jkqIpb2eQ
zZR?q~($EcjBHuKH^HTi{X+1(C4DXMg=h9rp0i7Ub5P$pL4WE`B!hCIDbwh9^eWMdv
zsj}HeIt?v#Aa+nN0cgsAEcc5qKj?12`yfvtrfQm_v}Y(inYa$b(8*`PdcPZrA$`hW
z&nrF}HKA{@ovGdGs&6E`fli))P!bA7G0LPp`V4=Hc`m0`&5x#E?rwAh0Ao<#?OTd2
zn3lej>Vq=t~J)9nd2Bc_B;G$1kQ@VN0Z&xf--tRX4(L`_WYV
z-gbsXDE6vm4!}8<1*Fl}CC8Xg&;xz23tuKZ$CcSzaec7PHDUJw?dK(|>w#byT4|6@
zq_GcmQfOgSt{^ARl-+PPf=w-r(#d2S50dk_vwBzS(J)euK|G|VQW;yLhig78dR@-C
zDEBkU2^nl~*!G|>qXGx&ZTvu7`ur*8X|JnU*v#|NB7J3qN0pj!?~K|p5tQGa{5y9>
z!D!Zo48y4ll$!_#2V;=a3^|3gw|C4&)d05v0d$!`D!@AmN
z38Hs*&QG47oS(coI>Dpy(bK?-J41$rF89J`*>(O-FGb>{4*3>m(Wx)vJXGbp&U%fhc^Wx+pe0Ms0$rolWhUa`0_ucu)
zad>w6A{-wM55wV`!&iT}IK2os^*1lV>%*g$r*BTeD_&fBqYH;`-=3dwQAOhOllP}5
z-?8g$&1aY%%LDVBw>=$}DFqJ8t4|j|>L9#;4*N3nke8^72_LsqN}C2sWc|#8^09E{
zI^!F6*$q)3>AJ{vG>Ls5efzCV#xu|zVCfm!rjuTzp>ovTB}E{^6xMlkZM;QbbNR`Q
zCustO>6TPuu+q{?J{p1FK4BcqA|mJWV>==&!w9P?14YEF>=W7}Y`wmtHR
zT^&|;fy_XGw}`a$voz^+m_DiF$Fm?()246v7J-THo^FEAbn*@$QQG(~)W~n@DPKer
zF1$o7tyOT0ZBXbSr8(jKs=@?b?df&!Uy3i!0c0=ezvMNdDs942Q
z7m@7Mto#7BHwL`&%5UHUcwu1tCjA0_1^@ov7ZDj*Rg^TCX<(+iS(O>_#dp6({lEYH
ze>!U~CsjGC2E|Q1yDZ94QGG6l^Kw>?M@4^peOHXni@oXfcv@}!IGzlqTgBtP)#rxx{hd0{b;VizhIrMiPcGfy;C#*K8r~UbK3TWWmo=Hjs`_JpIwe8
z1xudS!)jWb^T+J6o)*{hGuH6Kl(Xk)^P8KStw}Yl%1Qt7g41t}s@cQr;fFhqo;>aL
zo<4rsS$k1m6#M0@0HLtqgok9v=uKHJjwThAUp>1{vlvDD$|-s$1HwE#}w5aXCOl&hEfK14)61
zbbH0WpMwyH(c9{*onBUA{z-Akdz0#)=HQPPtI=f%Bfuszp>f7jgYnHMEGwqv-S4{J
z?R@ud?4SSkyNBOB`Wv#|GUqLlzx@5#?|-+^=7OU9{T(0Kfj^9gpAhWt=aXvq=l{t!
zzidQYoNPH^{`}8H|M$0@wX%4@fo3nt(Wl}#2$_k%e)YK;j;{?1FF{2$Vf9agY6LNl
z4mMBU9~XODCx8Cut&QRk=^mZUha1J|eDbM+u4fXT>p3qK2crvku$qWc`_*+hnQ6)9
zrQ(OG9GWs4IFpQrziLn;rp|pd+S2zZhd`QN(TPn&O~6V8qBw56RF38Mr*$t4ez#!mRfHs
z@Rz9dNquoSgKM{UHeub}pyG?88Jc=FfEy8wv*~P7_GiET>A#+g&u2H~q$&_`(}Cz#
zxS^JEhfp)A-hMTm7PE1*ABfIn1*1z7!|hIbL}412DzC4jZk`rb72>T2B{TIz&s|sj
z`aCFaSsj+D3f~t&n!o}V1za$2Z6Dw|kGk27z!|z5#auSK0_Yzr%#J}inu1^3_Mheu
zgtG_XOz*XTlp3F+=BMX(_L2#D09YIdl!qg}0m$iWK0tPEDq%*(v&dkGNKC-;PB}Bf
zd4~F!)x-K1&V>m{a3rwL#xS-Smubtmz*QinUl|AziMl@@mIxRi4A9FL0xUrT(juaS
z2mzG#+B_?#>^_lQ+k`ODvgN2SCt!Lxz6p3J&w#l>UpRBNOGL%zdLUFxs%Z9jbT*#d
zT|?Td>!E4LV6H*7+=u{mcqjr}CU70lm0)faKe8EY=H$9}tGKA)_Iy6-%lW~qh~A^|
zXfyi=(YzcFrjbt!N%!mx!vfoMK7`ro5q1+l!wKCY#4*Eu6^;7ZYHR=$6;NG{V$M5#vzmzM33PMf=
zu`BS4$@oTkXHZ;MW0<9oMyf~X%Fiw?b3fx{g@zS|rZXIvfolMNc~I6;j_*yz+|pyA
z{0S{w%PL(@DpPV-RXO6B^Z76l@_aJB($ZIaVv5T2U}S$72_`c|sKr`hWsKOUvll@#
zAsCl+|FXEkkwW-OoRO=udgL;IHUepQ$*8FT9&dG3Z{{CplljO#0iSxvPJ*L1EiS52
zHNio@EBcqXX?@ORS_L2Gb2+K28R`kl&95r=p=Fx;lHrftpErt+y`Q&=5B?lSMOsZ`
z3aKWnKxTRsFQ);hfK3881`I3M28sPyg9T7aI95}jiDIOscsN4P>}PxUNh~1Ifd_Y^
z@~Z9^#0{p%11MX#0#b7#s!=kk6MaC4Gu)ijSFR2s739Aqww7fPs)oZ#?gc(zIsH_8
z+yM`JjUXhg*%FCdH5w(M#2i0gr>)L;6x%kAYFNNkw^biW;NLg~fHj(-Ze%m+5WU18
zOmMIWPqE6Ce8_+^S(My>e%uAnH)Hg8Fk#g#0%zw~HjR>-(Rdahf@^e|DF50oVWqW)
zYZf<8#Si6RJiadWGqHY!+gEr1RILB7|LWuc(IH_(OEZ~Fx)29ZF(PBqS{kC78Soz;
zaV%-^$;G_Uo?mbT66EnXNu7!Lf|U)}z3-gD1~PId?4j@{bssLDBcyP9PKl@t^r0~$
zVUW;IXhcYdGcl6YXBE&g8V?XO$3!tDXo)&wjuM_AA$@q%)R&<{(y`H5`}=QsK~coF
zEiH!yof{Xu#(UbFU&~R$munyp2aM(xgK`={Zt<)DBZfzj$auUX=n1TLRsIo-WvqfL
zGpnTfd68!lO^`9Iz0z>y@4LH)F0HSWI=G%s%!Oj9g;z~f1)XW9OzP^9$%`osG8Gfy
zDA0cn2jVK>AaWW%J7S27#slM^oc#0#JPO_N6jE-W?=vY0w?UW+`^v-|qcFC>Z|Zpz
zafEybQ@L%RIoxqOXAn;zvFu|n(N9AYNz;^@;+6$zZ@j?h_Z1%>OKm^VKc4>V8c->S
z=p}x9rf<62h|?(!pL|q2JEa8tkH%p=ACqGd@4DM4Az0F
zHp~z#rPjyzlOuv#;H%SRbywo&WBA)>1bjt-B0Y9mj=aU^#PU;oOp6VXNj?m0?0*qHFjvR2k{uV6iVgk2sfZ&
z%Hd&td4!xIlhUarK1f6v3Ct+7+?+T@51Ja9jRz&}l4z2H9CsfOsr6^L7>+oXZpS(+
z2!JJq2vidi^nE|GSUAhX#}co4W|3Wr`Xm@-abDdtVT%bo$N|Gk3MIB!UDbAG?oQJv#9z#y9%u`wou*Cx409pF$^}q@%
ztC%$lqJryedIDw>FD!&vc~C8CiB+&7o5qw`eAT=_Ae`+<#kw@{E
zdr=|ePIl5_o?3wYyCs-Nj@Ni3Scph*n2aq4W0awcko*Z{&&cR!2^d%?*tkV^Fh9j6
z9&kE`xMAK`G@}Seh9V0-zljTwz`lSrzVe*JM)9LX
zaPd%^Qk^n2P4sJls)`qKMcmJTVlJY?3M?H4D}1dYfEi^fAxfQ%i*hi4Rb90%^K4O`
zv1XR#2?j4EQ}lPjeWTI7nL!gza$d258OJ7xAm4mEpABV+U7zC>Datp!WReRT*|Fq3
zyZB9?>$QMeB$PuCL=vJ3Ys2!+aMa`pC0m5ILAnHami>ur)f_G9A5e_ZGp|lh72lBF
zxkvAqG0gK~tMyKk
zKM&h7|EqDjiXWrDkV%y26OwL00WYE2V8Y>&e+)cr>lB|3FRup8d?C^kq7vG9P|e8C
znERnWZ!X6;zJc-Do+?4P*a~OsOY0ke0)+L?W$_0!9v=0!SxY!GzT6uU-YENmZhnnh
zN%4!+FmhL{@4Y6T{n(Y8eZtquu96`c5|&c2!!VEW^u)mHCm;6ppj`3yL=2mS8?2X9
zmylM-E#pnBtc(oXtiODv$Di`}Avv+YW-njy*|VRcWhNl2Zf67_CM0&Q3oUb2_CJx?
zLg|B_Cgc%bNyu&%q=1x)!4Laf<36Fp9mnQ{ocLu4abTol;>3F-
zG{|Rr;SUX@E9l#BSW;I(a{~T~=OZXh*2JBM#t`B$z9M->*2^}AH&yVIiXwPZN?sU<
z$NnJXl5%{p_wpaVKYsWA?S8Sx|6d=T9vtscj5?r_`r+{4$3{IT^q<$5*cQ&F^3xZo
zfrsM{0jdvlMZL
z4WuY&tdI&SWqN=w;OcW^vUP){8l2(8>QvLENVrc%C-b@gq@WtySD6tP=hX4r)@Kh%#-jvtJ
zo~z{q<)FT}BK55#35I|}UR?5o9ad=uSEXjN=E}XoHK6ov8M5^^ufcp){2-u}H!XG?
zD36I1ugNZ#=3TRl)}#=uoGgt31FW8lxl}xqnilwtca-vmLz29zm1sF~iQHs_8dW2&Pj0
zDh~yUvkNK36`(bE$!ip}V94~WTD(;xDhR918Af)pm9qWGnmWzUR0=#ZZuN7cDJ+^^
z(6`xY&L94?qq}jII%~)6?A?ji-JTJ|bg&)dti1hBL3cdz9e1w-xG=+SkH*
z6rPsRBM)k53MZP~kOH94Z0_YbnL$}=aqNu4TJeoeFr|SkV#wCEC4c?Xf6J&g4VhNF
zHgkC7IZ>shMjO`M2!Nyv8G-_01D)Mhq~eBvPVns5n3J2+l8S+7VFSbvgQB2WgcWe|
zcl9b-`mNm+foHQ5!(19
zJ{R{Otd-gXe~Obiu^cd^K&s(XZxyE>l*N(xH`=1BMluvAOPxWG%f6t_Ffqfbx^|Yb
z2+O!Y-;oTA#7UIJK3JxfG8o-SuGr5NH{xDIvqzMS=Ht7;7(z*_hrm)06VO|rpbU=J
z27%sbCHZIDh_XTzA*{3Y393Ua#0K<>dF%k{TR}zkg_cVB&BgbOFy^r)fml$6L9aN`
z?%Bj67lPW!7|%g~i~AxDzUuM@Ls*jmlm*sN+-;|zM
z!o$!J62S{Tcj^z6~<(-IsaKT
zyQ!4^L;HRv-b9mZ5n0cQTV!ES+!2o=9|&dheFCO|j|8VL6W?OTNd6kZ|wMD?z8sBvT_M82ne}i?J
z2-{pCLMK+UavQ;?XYLRTU~6*~GDx>f7jav`Oj=8wL6o
z+x{4>8#=1uPG96IvB^5@@!j?9gY_da2*r2B-Gj~=+%^e}M$)IpC%1RSC_P==KIj>X
zt;2tM1HSuxpoa$P3WZ38QiLk
z)QY0gcCHsqQcTa_`6uHj>+4B31v?n)5NT*bBt&IblLq~rr$@=TpSunKAS6>i62(KP`mi=Oc9m$UTG3v%^B8Q#mcFlS;EwNAKO#DUaKsdin(
z63~kN;pRIc60^=hz*Ea~bg8D7bu3|K>wN6(TH``b9V&nrQ(PkMLa2c<8mNI#2GU#D
zaapOys1gwrEb`HN^0OUN^&=B6X(`xavsjU_z|E6uQ!9;b;Ze*o*bt3NiJ7COr|%$gN9Yx`t48P?myGm4!Wmq4xwcVbYZvgw+u&NTpmF(2<^~KLwOG2YYHH
z31y~R?d70fk-DawmTA~q)cp(SfdYjFswSOS<$lZ31K#u;>8$jt%D6M?c451}@TvZUZDLa!(YsIZWLllc0n59?y=%~UMZTqV0qJzXA6)=9%BsY0N>PI-l2>pu+BK7qLyuL3k2}_~
zxMPX10A_bL)rYdgd9+;M&plm}X^OG;fN!2}t0J^vqm(xB1!N+T`A+;dEFW7ukcWE?
z=54V%-o~F~!W<2jN@$7VCavG5wE?aKUT&o8EXM%t2rTv4&5+n;V;wKcwC@02Uu8}gr{0rd?Mu(;n_%~@780f9iR
zZAitK)hJ}oj%E`VDBT2z%@`b#s0wArwD0SB#eNCd>HVdHh=c|Ky$$OB!4BdKPsPjz
zlyz-qVIfXpR!Fz=FCRG;z_DsH(8+;W0zh1JhM~Bmz1&PxJQ;er4xIvKlZmF}G6jx6
ziQIn#t@#Z|+){wavduihszt?732M1d=F~ZHurOhu1cwS$sj@;dQ6ftNxa0Fp{)cty
zSJ$&_%miQ1*byp9-IHC8y#bbF4Y)(NnogOfyMjk?02pMyQ0db+SHu1)e#j)))|o1T
zb*RlmHZ1!##+&S;Cxm&WhVpkfJHX)pMP#v!;T&rScO1
z$Ts|^ScHG)eamrcj=W@*z3#xNG|DA?FVBA!JyEHrmOi!i=|sJ-BX6?$dqw|;8d$MW
zk5hD={LYU%HY&061m1F<$T>o_OdrHj0dkS^thHg)ghZlVLa0SK%XCasOy(ereHm}8
zSQ=_W`!qTk$ET?TPRIm==|C?B#-XJ!Fk{Nk1=+ctGy_F;csJivB&uAs`^KdYeQyRRDNw$x>HW#!TJmLjynjm4sBI(g2
zfiguA#MOvF6zL$OG3DU&hj;t$UcdYPr>td8U3ZIGGcDNcmv3`FCWvh%!C&Z_*QBrp
zi7bz+c5CZNK(3s$-KO>S1`tg*uu>l>8&*7b_Y`k)rg0P0Z;
z^-hD09cF6o$i&H($Je5si+DMi
z6UQ?pt3@}8SIwmVu@7@g8BG%w{W@qOEH&>-OX|i01BiZMd1`iLhMKB$
z$V}qm;Yq)#obG23b0$&rfTyG__2iV1jLp3xGQx;Eu2tQiFu`ckiEbGePr*8>SdtL3
zhAwf7#ZidUL)r?juO(LkkM2Fiqxw^96e$tMcYC+@
z>qlCzGWucjZ>NUPw|MM)``ke^drg?F1d?GQi&3TGf-lf`Z>ELOZE7xd9m;O+>O9q{+)E%)W457i=L(jovFC05UIlgHN6N|)KJma`>6|7K!YmNjl1ebog{
zTAa5J0!i;mlQHtFi;@Gv0zwycW#p3vDNT^7Y`u$RI0%?KjPrO(&*O3iOy+KO
zz~u3QbKTs;@>_Y=%04%9732h(_}BQaoqqB>lRKY`{HYE8teoLhW^65Ik5#JjD*x2R
zFN2@8^oo7C%BM1yC%TMq0!_GHhK#(tdLJkKJwUA5Cnl!N)_bpvJS20NDW**B9$I-E
zpDIb8>#Qog2`f5yONj=5?mpcnU{u^nR<*(D=1QDy=16s5qE(8%odq@G@fP=-B0FZc
z>yb*GmR!*p1z&Pok{5S_`j*w#WDwASZr!^>M>d0{bgNiWFH|kP&60L^Y$+B?mA@j)
zCbT@(EF`ggL;R`zSNh~YMhiX2`AamZD|RyUcDn`dtMIBVn!$*q+{`onLJUZh$;$?S
z&YsH#4beE!!EmGHA`(GVLT((YO4DHB!V7Q45>tn&a|l8gtxgK89S(@`NjY~&kwfx!
zeDWV&$GNqbv~&J9YKPV@@M04Hu@lrF#sQHndU(!otnv~BZZo3X08hhKAkXe3vz(e%
zg|`rLN2^Oqc;|k1$9PO>?!Z`f7GX|-3iSp`50+NcqQY)KE+-+Li24&wHY@5??1rIk{^QZX@u5CAv8mF74|}iQ`;E9ZM0u1TJB$~V8J}CAneqpQ
zb;sp>srVLZHw!f(7;^Jr^ac%Wp>qDS)Qg+%#8kF%%CcCSOnMbP14G1=!zNw;+__a8
zSk@v-L`VaQzDnVTlts~`EckMh$mar8Mmx&|2o8CA2ddEn8SrpkM3yVNqdS_OEt*d2
z49P#uX((&z8TmUUR4lI%F*4Ecf>KhkJYZ@JHKaQRg>plI*T;Fz!et&+V`%ho;O~e9
zzZME6Nu;yviltU!d@t+a$`mJoGLG{iJGE==4c4W86Y$Zx=1RqkL(pG%YMGS?I?L--
z=|<5vToyFmf01c;9V?>LPMt5`(pffZ>+Wj%jW7C}p|gtuhJ32b(ix8?z=ZFB1#2FN
zJb>3@;U)*w`$3V*W~xw?;8?{ByeE`3>=5FK#=%}xj5(zr-POLPCREzHS1DF?|JQLY{8eaMyb=O88?z+0;L
zF4c>5g>yZn-P&eZXe0&?e$}^;MW?f_2S0x_W6nX?mBu
zM8#DOel3^X*omU8Dbj}?Qkw*XcB5iv>(TZ$ZTWUZUxWyqH{zmLvfy<|_Kqb-b}h?`
z-ivh605vu>nKeKM6|L*RT&It`!V&cxWb)CL-gftqakX7d
z@41<{P)M+mPjeJu7pTU%T<6tBQONvxsb^E-z$o`e>QixYvtXlac~+g$u#eGyS!ZCM
zPNLHrhc&6bhv7)|X}ja~8-<(8XV%agXBsh&A4V&q42JU=UxvOIF`%-{3(Ot;+QtJ#
zM`k%Kt8Kc?Yz=oRm4gAaiLq!_0)(NxSS^&eZhrNnnY)GSQv#44V9>MUBr`$pZCh2V
zQI}1lx#@q=F(t{R3Doh#hhfR*n2Ia;3@AsPm-$Y2VdKyX$sE7Ql?O1gTu(jWWx6g5dAHQE|yzepOD;JKExggjTvi|ULf4Z=GI8f&I_S
z4jzP2su0Xr@S~H7LE#FRK!X{OhdwLQxePQuD2`!lezK!@<%;u_r8r`d$~lksqpH;5
z7Ze3{)T)GN2VJ0E{*a1z-vxw2ZHak*C34F0f&
zWhAPxt~qEy1z>X0gU&Qj`8ZoO3eT>Tj+e$%Fvf}cxw%HyTWw4~tlSk!IWqK1BZGR*
z_2H7A*Rq646v6YMKF0-4BxMXkt-GqG94~<-M+zmuWbmjswN7!mBLel(Lzq5B`t5%1
zaQDT8k=2o~Gz3!X^>tGm8ii#Kaj6=!;l7k^#SBk)le!pdW#cB@7FC0m-VGqGJ$V0BaDPrpDuG;G$L*8(~eOCPJ1ET>uTJ(cjH9km+AX
zH#3&u>)>gG?`kUcWSUIavaY^^
z-Az5l?V5s6&L~@PGxVNSnZjYMXrRv7Y1ndXOF#$97(bivARA3%(_znl#4j3PMnp*Luv}w)(SK<
zfVpMZI3Z~je|M<+)IJJ!N-0zDAB|7$tR1>W39D6kDiqV)OsEEhU~0yHJ8Dp@AE2sC
zCT+8Q|Ai3CUD39qc_v2dg7}2uLu_``VQf#cD-2j`xJ2VKi)PKuyOUlRZ^RdhC6@yN
zCsM>&J99M?f^aD81#KJz*_1;PdQNt!U-^*L=jH;r;Uc{z;&L^xVdgWMlj3ym#cNVW
z&cP^fyPhJnFv_fJyY#TLxy-LDObRU-?>&A;uIcpf;G}r>s@UH<-8(rT>m(L`kIqO>
zHg)Te77z-;__e}*Yv$8itH!3ABAB>|A6<}ju+8>=$OzKlKM`!_iW?o(`*bht1EE!Og1k)Y6EZ6e%s*&5-q3U?FLySJG8(~Hj)ZGhEc6j$htfs!ZSA;bz(
zXU&k2=N5~_8@)uAv?eN1xjDwKG1R>WKf-#3NiCN
z$DA=6o62s)4{2Tbl&J-v>G1)vezsMw)36~`?N?x?-JC~cN8~ShXaK@}{0(d|d1BFm
zK1rDc4z*9>E7+t)a=misGtf$1wK)fu!GB97^4@x{_?HC%;~$Fex@53^w|-zhgo3R<
zCFs|HE>~qNeAat_E?fGnv*!NX3jWclcUOEKN)!&_`Xg?H5!-^;LB#;e9h8TeaVF}C
z<+;MRN^C?C4`2B$cka!$Kw7_jfMU1X7}#CUT8_eF`cP-u^bIvms5;%X(i7&ubc>fq
z@3F)<3{owRi|=1ZfgJC>;X<#NW^RaK2@`Ybtli6gNhZLaXYa`lQ4UHsUMW!wZSBlo
z1wo9V=y1C8E3&PtU}(W`8h$42s$hil>v%$*%zR+rXt6#+Mrpjkw6sKqorDbD%?~Jm
zzy-D<)B4C8TFK+ypypZy-0aciOgc@JSik0y%f{4$hSjA7!+?NmnSIedS$;6Hc!NvZ
z+ARa(4NFAKmWY-Et~>cz72B*#k`>#Y7k|^L*Q}-Lx6&5Zlw8%tS%cstDIE6m4g5&{0M1@;c%be73!;4!12zDvjL<8EBEB
zaVZ^NCi$*<+FwU%kpiqQbjhw2Y8FAs!;~0vhDf~;*H$wyiXg&3E)KwR=E4_q2dnYl
znqnFjk^{6n7?Y8f1EbC5@iv4Z0mToO*bT7ZQdO*?&asR}JEDI}KV2y6c;pv3Z7+Ec
z*=4Vm>aZPFOoMB}l$?_kzI&QM1{;12nt#zv4KY?s^tc0nN_Y1Z{Q&J4aJM>=n`j%A
zy4SRL$QG6heyVvECQCYC2;UF+&y%BrgZ<+D5k~uzIDGwIlN5hE=Fz5f2)
z@!{zYZ+Mjd?T7HV`|E%D&%KWRxnuv_?ey%w_0QeTj{P&Cz0-Z#@jMhi&}3;6nQ;aH
tM@o4TJLw_^G);-zsu-qkqkD!46$uaI`@8@527Lf;WuLtJoBV9-{|6VTnePAq
literal 0
HcmV?d00001
diff --git a/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt b/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt
new file mode 100644
index 0000000..e30d3a4
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt
@@ -0,0 +1,197 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+A Process Model for Test Driven Development in the Big Data Domain
+Daniel Staegemann https://orcid.org/0000-0001-9957-1003
+, Matthias Volk https://orcid.org/0000-0002-4835-919X
+109
+Staegemann, D., Volk, M., Jamous, N. and Turowski, K.
+A Process Model for Test Driven Development in the Big Data Domain.
+DOI: 10.5220/0011337200003335
+In Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - Volume 3: KMIS , pages 109-118 ISBN: 978-989-758-614-9; ISSN: 2184-3228
+Copyright c 2022 by SCITEPRESS – Science and Technology Publications, Lda. All rights reserved
+, Naoum Jamous and Klaus Turowski
+Magdeburg Research and Competence Cluster VLBA, Otto-von-Guericke University Magdeburg, Magdeburg, Germany
+Keywords: Big Data, Test Driven Development, TDD, Process Model, Design Science Research, DSR, Microservice. Abstract: Big data has emerged to be one of the driving factors of today’s society. However, the quality assurance of
+the corresponding applications is still far from being mature. Therefore, further work in this field is needed. This includes the improvement of existing approaches and strategies as well as the exploration of new ones. One rather recent proposition was the application of test driven development to the implementation of big data systems. Since their quality is of critical importance to achieve good results and the application of test driven development has been found to increase the developed product’s quality, this suggestion appears promising. However, there is a need for a structured approach to outline how the corresponding endeavors should be realized. Therefore, the publication at hand applies the design science research methodology to bridge this gap by proposing a process model for test driven development in the big data domain.
+1 INTRODUCTION rather recent proposition was the application of test
+driven development (TDD) to the implementation of Today’s society has developed to be heavily driven by BD systems (Staegemann et al. 2020).
+When done correctly, this could solve several kMnaomwlloekd ge2, 0i2n1fo).r maCtoionns eaqnude nttelych, nobliogg y d(aLtae vin(B aDn)d, issues at once. Not only would the quality and
+respectively big data analytics (BDA) have gained flexibility of the developed applications be increased, huge popularity among organizations that want to but possibly also the trust of the users, which is crucial profit from this rather new resource. Furthermore, to assure the frequent and genuine incorporation into those who do incorporate BDA into their processes the decision processes (Günther et al. 2017). However, experience (on average) a significant increase in so far, there has been no structured approach productivity (Müller et al. 2018), further justifying the formulated how the corresponding endeavors should positive sentiment. Yet, this only does apply to proper be realized. To bridge this gap, the following research use, which is, however, not always a given, since it is question (RQ) shall be answered:
+a highly challenging endeavor (Volk et al. 2019). The
+arguably most common issues in this regard are a low RQ: How can the process of applying test driven input data quality (Abdallah et al. 2022; Staegemann development in the big data domain be structured?
+et al. 2021b), human error or bias in the use of the
+applications, and erroneous implementations of the To answer the RQ, the publication at hand is respective systems (Staegemann et al. 2019). structured as follows. After the introduction, the
+For the publication at hand, the focus is on the background is briefly delineated. This is followed by latter. While there have been numerous works to an overview of the applied methodology. Afterwards, facilitate the testing of BD applications, it is still a in the main part, a process model for TDD in the BD rather immature topic (Staegemann et al. 2021c). domain is developed, which is also this work’s main Therefore, further work in this field is needed. This contribution. Subsequently, the model is further includes the refinement of existing approaches and discussed and avenues for future research are outlined. strategies as well as the exploration of new ones. One Finally, a conclusion is given.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+A Process Model for Test Driven Development in the Big Data Domain
+2 BACKGROUND heterogeneous (Freymann et al. 2020). This, inter
+alia, refers to the utilized programming languages and To establish a solid foundation and a common technology stacks. Moreover, their properties allow understanding for the further explanations, in the an independent deployment and usage. For this following, the most important terms and concepts are purpose, usually continuous deployment tools and briefly introduced. pipelines are used, allowing for the automation of the
+procedure.
+2.1 Big Data Even though in software engineering componentization is generally considered a good The amount of data that is being produced, captured, practice, achieving a high degree of modularity is and analyzed as a result of today’s society’s often seen as challenging task (Faitelson et al. 2018). digitization has been and is still rapidly growing However, when using microservices, this is achieved (Dobre and Xhafa 2014; Statista 2021; Yin and by design. This also reduces the effort for maintenance and the implementation of modifications, since it is
+Kaynak 2015). Concurrently, its complexity and the demands for its processing also increased. often sufficient to only redeploy the affected service Consequently, the systems that were previously used when incorporating changes. As a result, through the
+for this purpose are oftentimes no longer sufficient use of microservices, an evolutionary design, which is (Chang and Grady 2019). Therefore, new tools and driven by frequent and controlled changes, is techniques are needed to deal with the new promoted (Krylovskiy et al. 2015).
+requirements and simultaneously the term big data
+emerged to describe this phenomenon. Even though 2.3 Test Driven Development
+the origins of a term are not conclusively clarified
+(Diebold 2012) and there is also no unified definition TDD is generally seen as a development approach for it (Al-Mekhlal and Khwaja 2019; Volk et al. that (for the cost of a reduced speed) is feasible to 2020b), most of the relevant literature follows a improve an implementation’s quality (Staegemann et similar understanding. The arguably most influential al. 2021a). The corresponding advantages are description (Chang and Grady 2019) is based on four twofold. On the one hand, the test coverage is characteristics, which are sometimes also termed the increased. This helps to detect errors (early) and 4 Vs of big data. Those are volume (number and/or prevents that they affect the productive users. On the size of data entries), velocity (speed of data ingestion other hand, the system’s design is also influenced, and/or required processing speed), variety (diversity since a major part of TDD is its decomposition into of data and content), and variability (changes in the the smallest reasonable pieces. This reduced other characteristics over time). Due to the complexity also helps to avoid errors and increases widespread need for high quality decision making, maintainability (Crispin 2006; Shull et al. 2010). BDA is used in numerous domains, such as Even though the primary application area of TDD, manufacturing (Nagorny et al. 2017), management and also the one that is relevant for the remainder of support (Staegemann et al. 2022a), fashion (Silva et this paper, is in software development, it is also used al. 2019), education (Häusler et al. 2020), sports in other contexts, such as process modelling (Slaats et (Goes et al. 2020), agriculture (Bronson and Knezevic al. 2018) or ontology development (Davies et al. 2016), or healthcare (Bahri et al. 2019). 2019; Keet and Ławrynowicz 2016).
+In the traditional software development approach,
+2.2 Microservices new features are at first envisioned, then implemented and finally tested. However, in TDD, this order is changed. While the first step remains the same, the
+The general idea of the microservice concept is to decompose an envisioned application into several identified functionality is broken down into small smaller services that then interact with each other to parts (Fucci et al. 2017). In the following, tests for
+accomplish the given task (Nadareishvili et al. 2016). those parts are written. To assure that they indeed test new aspects, they are run and should, for a lack of the
+Usually, the services are all based on business functionality. This, in turn, allows it to benefit from a actual implementation, fail (Beck 2015). If they high degree of specialization. The microservices all don’t, they need to be reworked due to the premise.
+After the tests failed, the productive coding takes run in their own process and, for the communication among each other, only lightweight mechanisms are place, resulting in the desired functionality. The main
+utilized. Due to their independent nature, the focus here is just to make it work. In turn, other particular services implementation can be aspects, like the elegance of the code, are not
+important, as long as the previously written tests are homogenous toolset, but can instead rely on the passed (Crispin 2006). If this is the case, the code is technology set they deem the most suitable for the then refactored to improve the readability, its given task, due to the independence of the services adherence to standards, best practices, and from each other. In another context, TDD also conventions and to improve its overall quality (Beck increases the flexibility. The created tests allow for 2015). While doing so, the previously written tests are easier and safer changes to the developed application utilized as a safety net to make sure that no errors are because they can be immediately validated through introduced during this procedure. As mentioned the existing tests, leading to faster feedback, the earlier, this focus on incremental modifications and avoidance of newly introduced errors and small tasks (Williams et al. 2003) does not only affect consequently more trust by the users. However, even the coverage, but also the design of the developed though the general idea of applying TDD in the BD solution. Moreover, developers are provided with domain seems promising and there are already some more immediate feedback, due to the shorter test works in the domain (Staegemann et al. 2022b), to cycles (Janzen and Saiedian 2005). While unit tests facilitate its diffusion and make its application more are usually the backbone of TDD, they can (and accessible, it is still necessary to develop further should) also be amended by other types of tests, such corresponding patterns, frameworks, process models, as system, tests, or integration tests (Sangwan and best practices, and approaches to provide developers Laplante 2006). Hereby, especially the latter can be with a solid foundation they can lean on for their seen as essential (Kum and Law 2006). 
Furthermore, projects, instead of having to determine all steps (and to make sure the necessary test frequency can be their order) on their own.
+achieved without the developers having to
+cumbersomely deal with it manually, TDD is often
+combined with a continuous integration (CI) pipeline 3 METHODOLOGY
+to enable test automation (Karlesky et al. 2007;
+Shahin et al. 2017). Consequently, whenever a In order to assure scientific rigor while answering the change is committed, a CI server runs the existing RQ, the design science research (DSR) approach tests, checking if the last change has introduced any (Hevner et al. 2004) is applied. This constructive new errors that need to be fixed. methodology is geared towards the development and
+2.4 Test Driven Development in Big evaluation of artifacts in the information systems research domain. The purpose of those is to solve
+Data organizational problems. They can be “constructs (vocabulary and symbols), models (abstractions and
+As it was already described earlier, applying TDD is representations), methods (algorithms and practices), a promising new approach for the engineering of and instantiations (implemented and prototype high-quality BD applications. For this purpose, the systems)” (Hevner et al. 2004). To further enhance use of microservices as a technical foundation has the comprehensibility, the workflow of the design been proposed (Staegemann et al. 2020). Since a science research methodology (DSRM) presented in major component of TDD is to break down the (Peffers et al. 2007) is followed. The DSRM desired application into small parts and microservices decomposes the DSR into a sequence of six steps, facilitate exactly this architectural concept, there is a which are depicted in Figure 1.
+huge synergy that can be exploited (Shakir et al. The DSRM begins with the problem 2021). Their use allows to realize each business identification and motivation, which are outlined in functionality as a separate service, which also gives the beginning of the next section. In the second the option for independent scaling, depending on the activity, the researcher shall define the objectives for respective workloads. Further, this also impacts the a solution. This will also be part of the same implementation process, since the development of the subsection. The third step, design and development, respective services can be distributed across different will be discussed in the succeeding subsection, teams. Additionally, those don’t have to use a resulting in the construction of the DSR artifact as the
+
+Figure 1: Process Sequence of the DSRM According to (Peffers et al. 2007).
+main contribution of the publication at hand. facilitate the use of TDD in the BD domain to increase Furthermore, the underlying explanations will serve the overall quality of the developed solutions. as an implicit, preliminary evaluation, which Furthermore, this process should be easy and corresponds to activity five. The final activity, unambiguous to follow, which on the one hand refers communication, is performed through the publication to the outlined sequence of steps, but on the other hand at hand. However, due to the artifact being a process also on the utilized notation.
+model, whose phases need to be filled with concrete
+activities (which is out of this work’s scope) for its 4.2 Development of the Artifact
+actual implementation, the demonstration will be
+deferred to the future. Since this work builds upon the MBTDD-BD
+proposition (Staegemann et al. 2020), it will also
+follow the general structure, which results in the 4 THE PROCESS MODEL existence of several levels (system, component,
+subcomponent/ microservice, method). Furthermore, In the following, using the DSRM by Peffers et al. the wording is adopted, increasing the (2007), a process model is proposed, facilitating the comprehensibility. Moreover, even though in the application of TDD in the BD domain through the following only tests are explicitly mentioned, as provisioning of a structured approach that supports suggested in the MBTDD-BD, benchmarks can also developers in implementing their respective BD be added alongside them to introduce another endeavors in a test driven manner. dimension of quality assurance. However, the main
+focus is on the functional testing.
+4.1 Motivation To start the process, it is at first necessary to know the requirements for the system that shall be
+When applying the DSRM, the first activity is to developed (ISO 2018; Sommerville 2007). However, identify the problem that shall be solved, and to in the context of this work, outlining their gathering motivate, why this should be done. In the case at hand, would be out of scope. Therefore, the list of it was already outlined why big data is of great requirements is considered as an available input. significance for today’s society. Further, the Based on those, concrete features of the system can be derived. While it is not yet determined how they will
+importance of proper quality assurance was outlined and it was discussed how the application of TDD be implemented, this step turns the identified needs might help in the implementation of the corresponding into high level tasks and is therefore a prerequisite for
+the actual realization. In the TDD methodology, after systems. However, to our knowledge, an actual procedure for this has not yet been formalized. While determining what is to be implemented, the
+it is necessary to maintain a certain degree of freedom corresponding tests shall be written. Accordingly, the to reflect the individual nature of such projects, this next step is to define the tests for the system as a also constitutes both, a barrier for entry, as well as a whole. Those might be automated, manual, or a hybrid potential source for errors and inefficiencies. Since the approach and are supposed to show if it provides the desired functionality. Implementing the system tests at
+pbraospedo seTdD cDo nicne ptth efo rb itgh ed aaptap lidcoamtioanin o f( MmBicTroDsDer-vBicDe-) such an early stage on the one hand corresponds with the TDD philosophy, and on the other hand potentially
+cnounmtabienrs osfe vaecrtaivl ilteivese lrse aqnudir teydp efos ro fit tse simts,p tlheemree nista ati boing. also brings practical advantages. This step, as the Developers that don’t have extensive experience with previous one, immensely benefits from having domain
+knowledge and a comprehensive overview of the TnuDmDb einr tohfe BdiDff edroemnta ipno mssiigbhlet boer ddeertes rroefd tbhyo sthee (hwuigthe product’s business side, respectively the purpose it is
+developed for. Therefore, the process should heavily wrersounltgs ),d aesc wisieolln as s ltehaed tihnrge att oo f eoxvterarl owokoirnkg iomr pworotarsnet involve experts or potential users from that domain.
+activities, which would reduce the effectiveness of the Meanwhile the further steps are of rather technical nature and do not need that much comprehensive
+athpapnr oathceh . Striandcieti TonDaDl iasp upsruoaalclyh m(oSrtea etgimeme acnonn suemt inalg. knowledge of all usage related aspects of the product. 2021a), this additional effort can only be justified if By creating the system tests early, it is possible to
+focus the involvement of the needed knowledge tThhee rceofrorrees,p iot nisd ninegc ebsseanreyf ittos pcraonv idaec tdueavlleyl obpee rsr ewaiptehd a. carriers on the starting phase, which allows them to
+structured procedure to reduce this uncertainty, focus on their day to day tasks afterwards, while the eliminate potential sources of error and, hereby, technical experts take over from then. (Even though
+some involvement of distinct business experts/users next. Further, in succession, there is also a change might still be needed for some decisions that might from the component level to the subcomponent level. arise later.) Once the system tests have been created, There, analogous to the previous levels, at first, tests the implementation can be progressed. For this for the unit (in this case the microservice) as a whole purpose, the previously identified features are are written, allowing to later on confirm that the translated into distinct microservices, which envisioned capabilities have actually been inherently also determines the system’s architecture. successfully realized. When the creation of those tests Further, not only the services and their functionality is assigned to a team that is different from the one that are defined, but also their interfaces. The result of this is responsible for the implementation, this can also act step is an overview of the required microservices as as an additional safety net by adding another well as their interconnections. However, the concrete perspective on potential issues and edge cases. This implementation of the services is not yet designed. In also constitutes a deviation from the proposition the following, those microservices, which are also expressed in the original MBTDD-BD paper called subcomponents in the MBTDD-BD, are (Staegemann et al. 2020), since there, the assurance of grouped to components. A component constitutes a the functionality of the microservice as a whole was contentual unit that is deemed belonging together by described as only being implemented indirectly, the developers, respectively architect. Those could for through the tests within the developed service. example be the loading of data that consists of several Explicit tests were not intended. 
However, since the services that are each specialized to provide data from inclusion of such tests for the entire service allows to one specific (type of) source or the preprocessing that incorporate a view on the slightly bigger picture, comprises multiple steps that are each realized as a which is not necessarily given on the method level, separate microservice. However, there are no fixed their integration reduces the risk of overlooking issues rules, instead the definition of components is subject that are not as apparent when only operating on the to the individual assessment of the decision makers. method level.
+Moreover, depending on the context, components can The creation of the tests for the microservice as a also overlap (e.g. a microservice can belong to several whole is followed by the test driven implementation components), or just comprise a single subcomponent, of that service, as it is described in the related in case it is rather standalone. Yet, for the sake of background section. Therefore, at first, the tests for a coherence, each microservice has to belong to at least function are written, then the functionality is one component. implemented and finally the code is refactored to
+Subsequently, to later on assure that not only the increase its quality and readability. This procedure is components itself but also the communication repeated until the entire service is completed. While between them works as intended, corresponding tests the described process as a whole takes place on the have to be created. While all those steps, that happen subcomponent level, the implementation of the on the system level, are only conducted once, the particular functions corresponds to the method level. succeeding activities are performed repeatedly until Once the implementation is finished, the the implementation of all components is finished. At aforementioned tests for the entirety of the first, is has to be chosen, which component shall be subcomponent are run. In case that they do not pass worked on next. The criteria for this decision can be completely, the service goes back to the previous individually determined. Possible reasoning could, for implementation stage, where it is worked on until the example, be based on factors such as the availability issue is deemed resolved. Once the subcomponent of certain experts, the perceived importance or tests pass, the subcomponent level is left, the process complexity, or contentual relations and again enters the component level and the microservice interdependencies. It is also possible that a specific can be integrated into the current iteration of the microservice shall be implemented at this stage (for component.
+example based on above mentioned criteria) and However, this is not the final step concerning the therefore the corresponding component is chosen at regarded service. It is possible that a microservice in this stage. After the decision is made, the system level itself is not erroneous and, therefore, the testing is is left and the work on the component level begins. positive, but there are issues with the interplay with If the component has not yet been worked on other services. An example (even though it is not big before, the next step is to create the tests for the data related) that made the news was the NASA component, otherwise this can be skipped, since it has climate orbiter crash from 1999, where one involved already been done in the past. Then it has to be partner used English units and the other metric ones, determined which microservice will be implemented leading to a failed mission, despite both parts in itself
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+113
+A Process Model for Test Driven Development in the Big Data Domain
+being functional (NASA 2019). To avoid a similar situation, the integration of the subcomponent needs to be followed by a run of the component tests as well as the relevant tests for the communication. Only if those also pass, the microservice can be deemed finished. Otherwise, the developers have to go back to the development stage. However, in case of success, the component level is left and the system level is entered again. Now, the further procedure depends on the current status of the system’s implementation. If there are still components that are not entirely finished, it has to again be decided, which component should be worked on next. From there, the process continues as already outlined above.
+In case every component, and therefore every part of the envisioned system, has been implemented and individually tested with success, a final test run that
+comprises all tests (including those for the system as a whole) allows to check for a last time, if everything is working as intended. Should there be any problems, those have to be thoroughly analyzed. Once the source of error is identified, the developers shall fix the underlying issues, using the comprehensive test collection to assure that no new errors are introduced. However, if this last instance of quality assurance is also passed without the occurrence of any problems, the development process is finished and the system can be used productively.
+The complete process model is displayed in Figure 2. To give an easy to follow overview of the proposed process model, its graphical depiction is heavily leaning onto the BPMN notation. However, this also introduces some constraints. The levels of the process are depicted as separate BPMN pools.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+A Process Model for Test Driven Development in the Big Data Domain
+
+Figure 2: Process Model for Test Driven Development in the Big Data Domain.
+While this slightly deviates from the idea behind the differ from other development contexts, so that a concept of pools in BPMN, it increases visual clarity specific description is not necessary.
+and was therefore implemented. Since the test driven Another aspect that is highly important but not implementation of the microservice is depicted as one directly covered by the process model is the selection step and not further broken down, there are only three of tools and technologies. While the modular nature levels shown, with the method level being omitted. of the MBTDD-BD allows for a high degree of
+Furthermore, especially in larger projects, it is flexibility and gives the developers the choice, which likely that several teams work in parallel, whereas the programming languages, frameworks or existing depicted process presents a linear sequence. This is solutions they want to use, respectively incorporate, also for the sake of visual clarity. However, in reality, there is no support provided for those decisions. Since there might be several microservices (also from there is a plethora of available options, this task can, different components) be worked on at the same time. however, also be highly challenging. While there are Yet, this does not crucially affect the actual flow, already existing works that focus on a general wherefore it is only mentioned but not graphically decision support for the technology selection in BD represented. Additionally, the outlined process refers projects (Volk et al. 2020a), additional material that to projects that are created from scratch. If an is geared towards this specific situation might be application that was built according to the proposed helpful for prospective developers and, hence, also procedure shall be modified, the already existing tests help to facilitate the dissemination of TDD in the BD can be utilized. Changes on any other pre-existing domain in general.
+systems are out of scope of the proposed process Additionally, as previously mentioned, the model and individual approaches have to be found. proposed model slightly simplifies the development process by presenting it as a sequential flow. While is
+reality, several teams might work in parallel on 5 DISCUSSION AND FUTURE several services, the increased comprehensibility was deemed worth it to accept that slight simplification as
+WORK a trade-off. When applying the model in a parallel scenario, it is therefore necessary to account for this
+With the steady increase of the number of BD decision and adjust the actual workflow accordingly. applications that are being used and their quality Further, the model only outlines which actions assurance being one of the major challenges should be taken in which order, but not by whom. (Staegemann et al. 2019), finding ways to tackle that Even though the specifics of this decision obviously issue is highly important. While the MBTDD-BD heavily depend on the structures of the organizations approach seems generally promising to increase the and teams that are involved, the identification of best quality as well as the modifiability of the developed practices and recommendations could still prove to be systems, up to now, there was no structured procedure valuable support. Therefore, this might be a for its application. The proposed process model is worthwhile task for future researchers that has strong directed towards bridging this gap. By following the practical implications.
+comprehensive sequence of steps, the necessary Since the quality of big data applications heavily activities can be covered, while also assuring that the depends on the correct architectural choices (Ataei order is actually sensible and corresponds to the spirit and Litchfield 2020) and there are numerous patterns of the TDD methodology. proposed for the implementation of microservices, it
+However, several factors have to be taken into also appears reasonable to regard those two aspects in account. The first aspect is that the requirements for context of each other to determine, which the system are taken for granted. While this makes microservice patterns are best suited to deal with sense for the aspired scope, they are extremely certain challenges of big data development and the important for the success of an implementation underlying big data characteristics.
+project. Therefore, it is mandatory to find a suitable
+approach for their collection. This also means that the
+proposed process model cannot be seen as a panacea 6 CONCLUSION
+but has to be used in conjunction with other suitable
+methods. To a lesser degree this also applies to the
+test driven implementation of the distinct Big data and the corresponding tools, technologies, and applications have emerged to be one of the microservices not being described in detail. However, driving factors of today’s society. Countless
+on this level, the development does not crucially
+
+Figure 3: The DSR Grid for the Presented Work.
+organizations from numerous domains rely on the endeavor in its entirety is given in Figure 3, in the form ability to utilize information to an unprecedented of the DSR Grid (Vom Brocke and Maedche 2019). extent to improve their inherent processes and
+decision making, and, thereby, inter alia, reduce their
+costs, increase their productivity, strengthen their REFERENCES
+marketing, support their maintenance, improve their
+logistics, or identify new opportunities. However, the
+implementation of those systems is a highly Abdallah, M., Hammad, A., and Al-Zyadat, W. (2022). “Towards a Data Collection Quality Model for Big Data challenging and error-prone task, while at the same Applications,” in Business Information Systems
+time their quality is crucial for the successful use. Workshops, W. Abramowicz, S. Auer and M. Stróżyna Therefore, their quality assurance is very important. (eds.), Cham: Springer International Publishing, pp. Yet, this domain is still far from being mature. 103-108 (doi: 10.1007/978-3-031-04216-4_11). Therefore, further work in this field is needed. This Al-Mekhlal, M., and Khwaja, A. A. (2019). “A Synthesis includes the improvement of existing approaches and of Big Data Definition and Characteristics,” in strategies as well as the exploration of new ones. One Proceedings of the 2019 IEEE International rather recent proposition was the application of test Conference on Computational Science and Engineering driven development to the implementation of big data (ECmSbEe)d deadn da ndI EUEbEi quIintoteursn aCtioomnpaul tinCgo n(EfeUreCn)c,e Neown
+systems. However, it was not outlined how the York, NY, USA. 01.08.2019 - 03.08.2019, IEEE, pp. corresponding process should be designed. 314-322 (doi: 10.1109/CSE/EUC.2019.00067).
+The publication at hand bridges this gap and Ataei, P., and Litchfield, A. (2020). “Big Data Reference provides developers that are interested in the Architectures, a systematic literature review,” in application of TDD in the BD domain with a process Australasian Conference on Information Systems model that outlines, which activities should be (ACIS) 2020, Wellington, New Zealand, AIS. performed in which order and, therefore, helps in Bahri, S., Zoghlami, N., Abed, M., and Tavares, J. M. R. S. structuring the implementation process. Thereby, it (A2c0c1e9s)s. “BIG( D7)A, TA foprp H. ealthcare: A Survey,” I(EdEoEi: helps in disseminating the general approach, 10.1109/ACCESS.2018.28891807).3 97-7408
+facilitates its effective utilization, promotes a stronger Beck, K. (2015). Test-Driven Development: By Example, focus on the topic of quality assurance, and can be Boston: Addison-Wesley.
+used as a foundation to advance the scientific Bronson, K., and Knezevic, I. (2016). “Big Data in food and discourse in the domain. An overview of the research agriculture,” Big Data & Society (3:1) (doi:
+10.1177/2053951716648174).
+Chang, W. L., and Grady, N. (2019). “NIST Big Data Hevner, A. R., March, S. T., Park, J., and Ram, S. (2004).
+Interoperability Framework: Volume 1, Definitions,” “Design science in information systems research,” MIS Special Publication (NIST SP), Gaithersburg, MD: quarterly, pp. 75-105.
+National Institute of Standards and Technology. ISO. (2018). “International Standard ISO / IEC / IEEE Crispin, L. (2006). “Driving Software Quality: How Test- 29148 Systems and Software Engineering — Life
+Driven Development Impacts Software Quality,” IEEE Cycle process - Requirements Engineering,”
+Software (23:6), pp. 70-71 (doi: 10.1109/MS.2006.157). ISO/IEC/IEEE 29148:2018.
+Davies, K., Keet, C. M., and Lawrynowicz, A. (2019). Janzen, D., and Saiedian, H. (2005). “Test-driven
+“More Effective Ontology Authoring with Test-Driven development concepts, taxonomy, and future direction,”
+Development and the TDDonto2 Tool,” International Computer (38:9), pp. 43-50 (doi:
+Journal on Artificial Intelligence Tools (28:7) (doi: 10.1109/MC.2005.314).
+10.1142/S0218213019500234). Karlesky, M., Williams, G., Bereza, W., and Fletcher, M. Diebold, F. X. (2012). “On the Origin(s) and Development (2007). “Mocking the Embedded World: Test-Driven
+of the Term 'Big Data',” SSRN Electronic Journal (doi: Development, Continuous Integration, and Design
+10.2139/ssrn.2152421). Patterns,” in Embedded Systems Conference, San Jose, Dobre, C., and Xhafa, F. (2014). “Intelligent services for California, USA. 01.04.2007 - 05.04.2007, UBM
+Big Data science,” Future Generation Computer Electronics.
+Systems (37), pp. 267-281 (doi: Keet, C. M., and Ławrynowicz, A. (2016). “Test-Driven
+10.1016/j.future.2013.07.014). Development of Ontologies,” in The Semantic Web. Faitelson, D., Heinrich, R., and Tyszberowicz, S. (2018). Latest Advances and New Domains, H. Sack, E.
+“Functional Decomposition for Software Architecture Blomqvist, M. d'Aquin, C. Ghidini, S. P. Ponzetto and
+Evolution,” in Model-Driven Engineering and Software C. Lange (eds.), Cham: Springer International
+Development, L. F. Pires, S. Hammoudi and B. Selic Publishing, pp. 642-657 (doi: 10.1007/978-3-319-
+(eds.), Cham: Springer International Publishing, pp. 34129-3_39).
+377-400 (doi: 10.1007/978-3-319-94764-8_16). Krylovskiy, A., Jahn, M., and Patti, E. (2015). “Designing Freymann, A., Maier, F., Schaefer, K., and Böhnel, T. a Smart City Internet of Things Platform with
+(2020). “Tackling the Six Fundamental Challenges of Microservice Architecture,” in Proceedings of the 2015
+Big Data in Research Projects by Utilizing a Scalable 3rd International Conference on Future Internet of
+and Modular Architecture,” in Proceedings of the 5th Things and Cloud (FiCloud 2015), I. Awan (ed.), Rome,
+International Conference on Internet of Things, Big Italy. 24.08.2015 - 26.08.2015, Piscataway, NJ: IEEE,
+Data and Security, Prague, Czech Republic. 07.05.2020 pp. 25-30 (doi: 10.1109/FiCloud.2015.55).
+- 09.05.2020, SCITEPRESS - Science and Technology Kum, W., and Law, A. (2006). “Learning Effective Test Publications, pp. 249-256 (doi: Driven Development - Software Development Projects 10.5220/0009388602490256). in an Energy Company,” in Proceedings of the First
+Fucci, D., Erdogmus, H., Turhan, B., Oivo, M., and Juristo, International Conference on Software and Data
+N. (2017). “A Dissection of the Test-Driven Technologies, Setúbal, Portugal. 11.09.2006 - Development Process: Does It Really Matter to Test- 14.09.2006, SciTePress - Science and and Technology First or to Test-Last?” IEEE Transactions on Software Publications, pp. 159-164 (doi: Engineering (43:7), pp. 597-614 (doi: 10.5220/0001316101590164). 10.1109/tse.2016.2616877). Levin, I., and Mamlok, D. (2021). “Culture and Society in Goes, F. R., Meerhoff, L. A., Bueno, M. J. O., Rodrigues, the Digital Age,” Information (12:2), p. 68 (doi:
+D. M., Moura, F. A., Brink, M. S., Elferink-Gemser, M. 10.3390/info12020068).
+T., Knobbe, A. J., Cunha, S. A., Torres, R. S., and Müller, O., Fay, M., and Vom Brocke, J. (2018). “The Lemmink, K. A. P. M. (2020). “Unlocking the potential Effect of Big Data and Analytics on Firm Performance: of big data to support tactical performance analysis in An Econometric Analysis Considering Industry professional soccer: A systematic review,” European Characteristics,” Journal of Management Information journal of sport science, pp. 1-16 (doi: Systems (35:2), pp. 488-509 (doi: 10.1080/17461391.2020.1747552). 10.1080/07421222.2018.1451955).
+Günther, W. A., Rezazade Mehrizi, M. H., Huysman, M., Nadareishvili, I., Mitra, R., McLarty, M., and Amundsen,
+and Feldberg, F. (2017). “Debating big data: A M. (2016). Microservice architecture: Aligning literature review on realizing value from big data,” The principles, practices, and culture, Beijing, Boston, Journal of Strategic Information Systems (26:3), pp. Farnham, Sebastopol, Tokyo: O´Reilly.
+191-209 (doi: 10.1016/j.jsis.2017.07.003). Nagorny, K., Lima-Monteiro, P., Barata, J., and Colombo, Häusler, R., Staegemann, D., Volk, M., Bosse, S., Bekel, C., A. W. (2017). “Big Data Analysis in Smart
+and Turowski, K. (2020). “Generating Content- Manufacturing: A Review,” International Journal of
+Compliant Training Data in Big Data Education,” in Communications, Network and System Sciences (10:03),
+Proceedings of the 12th CSEdu, Prague, Czech pp. 31-58 (doi: 10.4236/ijcns.2017.103003).
+Republic. 02.05.2020 - 04.05.2020, SCITEPRESS - NASA. (2019). “Mars Climate Orbiter,” available at Science and Technology Publications, pp. 104-110 https://solarsystem.nasa.gov/missions/mars-climate- (doi: 10.5220/0009513801040110). orbiter/in-depth/, accessed on Feb 27 2022.
+Peffers, K., Tuunanen, T., Rothenberger, M. A., and Staegemann, D., Volk, M., Saxena, A., Pohl, M., Nahhas,
+Chatterjee, S. (2007). “A Design Science Research A., Häusler, R., Abdallah, M., Bosse, S., Jamous, N.,
+Methodology for Information Systems Research,” and Turowski, K. (2021b). “Challenges in Data
+Journal of Management Information Systems (24:3), pp. Acquisition and Management in Big Data
+45-77 (doi: 10.2753/MIS0742-1222240302). Environments,” in Proceedings of the 6th International Sangwan, R. S., and Laplante, P. A. (2006). “Test-Driven Conference on Internet of Things, Big Data and
+Development in Large Projects,” IT Professional (8:5), Security, Prague,Czech/Online Streaming. 23.04.2021 -
+pp. 25-29 (doi: 10.1109/MITP.2006.122). 25.04.2021, SCITEPRESS - Science and Technology Shahin, M., Ali Babar, M., and Zhu, L. (2017). “Continuous Publications, pp. 193-204 (doi:
+Integration, Delivery and Deployment: A Systematic 10.5220/0010429001930204).
+Review on Approaches, Tools, Challenges and Staegemann, D., Volk, M., and Turowski, K. (2021c).
+Practices,” IEEE Access (5), pp. 3909-3943 (doi: “Quality Assurance in Big Data Engineering - A
+10.1109/ACCESS.2017.2685629). Metareview,” Complex Systems Informatics and Shakir, A., Staegemann, D., Volk, M., Jamous, N., and Modeling Quarterly (28), pp. 1-14 (doi:
+Turowski, K. (2021). “Towards a Concept for Building 10.7250/csimq.2021-28.01).
+a Big Data Architecture with Microservices,” in Staegemann, D., Volk, M., and Turowski, K. (2022b).
+Proceedings of the 24th International Conference on “Adapting the (Big) Data Science Engineering Process
+Business Information Systems, Hannover, to the Application of Test Driven Development,” in
+Germany/virtual. 14.06.2021 - 17.06.2021, pp. 83-94 Proceedings of the 19th International Conference on
+(doi: 10.52825/bis.v1i.67). Smart Business Technologies, Lisbon, Portugal. Shull, F., Melnik, G., Turhan, B., Layman, L., Diep, M., 14.07.2022 - 16.07.2022, SCITEPRESS - Science and
+and Erdogmus, H. (2010). “What Do We Know about Technology Publications, pp. 120-129 (doi:
+Test-Driven Development?” IEEE Software (27:6), pp. 10.5220/0011289200003280).
+16-19 (doi: 10.1109/MS.2010.152). Statista. (2021). “Volume of data/information created, Silva, E. S., Hassani, H., and Madsen, D. Ø. (2019). “Big captured, copied, and consumed worldwide from 2010
+Data in fashion: transforming the retail sector,” Journal to 2025,” available at
+of Business Strategy (41:4), pp. 21-27 (doi: https://www.statista.com/statistics/ 871513/worldwide-
+10.1108/JBS-04-2019-0062). data-created/, accessed on Feb 13 2022.
+Slaats, T., Debois, S., and Hildebrandt, T. (2018). “Open to Volk, M., Staegemann, D., Bosse, S., Nahhas, A., and
+Change: A Theory for Iterative Test-Driven Modelling,” Turowski, K. (2020a). “Towards a Decision Support in Business Process Management, M. Weske, M. System for Big Data Projects,” in WI2020 Zentrale Montali, I. Weber and J. Vom Brocke (eds.), Cham: Tracks, N. Gronau, M. Heine, K. Poustcchi and H. Springer International Publishing, pp. 31-47 (doi: Krasnova (eds.), GITO Verlag, pp. 357-368 (doi: 10.1007/978-3-319-98648-7_3). 10.30844/wi_2020_c11-volk).
+Sommerville, I. (2007). Software Engineering, eighth Volk, M., Staegemann, D., Pohl, M., and Turowski, K.
+edition, Addison-Wesley. (2019). “Challenging Big Data Engineering: Staegemann, D., Feuersenger, H., Volk, M., Liedtke, P., Positioning of Current and Future Development,” in
+Arndt, H.-K., and Turowski, K. (2022a). “Investigating Proceedings of the 4th International Conference on
+the Incorporation of Big Data in Management Internet of Things, Big Data and Security, Heraklion,
+Information Systems,” in Business Information Systems Crete, Greece. 02.05.2019 - 04.05.2019, SCITEPRESS
+Workshops, W. Abramowicz, S. Auer and M. Stróżyna - Science and Technology Publications, pp. 351-358
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+119
diff --git a/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt b/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt
new file mode 100644
index 0000000..f0d175c
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt
@@ -0,0 +1,127 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+SYNTHETIC FLIGHT TEST DATA FOR BIG DATA
+COMPUTING
+Bob Baggerman
+Avionics Test and Analysis Corp (ATAC) 4540 East Highway 20
+Niceville, FL 32578
+bob.baggerman@avtest.com
+ABSTRACT
+There is currently quite a bit of development taking place within the DoD flight test range community in “Big Data” computing. A problem plaguing development is a lack of suitable data sets for development and test of software analysis tools. Most actual flight test data has restricted distribution and so isn't available for many developers. Also, it can be difficult to find actual recorded flight test data which have “interesting” properties such as specific flight profiles and events.
+Synthesized IRIG 106 Chapter 10 format flight test data solves these problems by providing data files to developers that are very similar to what might be expected from an actual flight test. Synthetic data files are complete and properly formed data files that contain fake but realistic flight test data as if it had been recorded during an actual flight test. The data in these data files is designed to provide interesting test cases for software tool developers to use.
+INTRODUCTION
+The Department of Defense (DoD) has been pursuing cloud based storage and processing solutions for flight test data. Storing and processing flight test data in the cloud is a fundamentally different kind of processing environment that will require new software tools and techniques to be developed. Development of these new analysis software tools and techniques requires test data that isn’t readily available to developers. Software tools for creating carefully crafted synthesized (i.e. synthetic) data files have been developed to create useful synthetic flight test data sets.
+Big Data is typically defined by the three “V”s, volume, velocity, and variability. The volume of data refers to data sets that are too large to be processed and viewed all at once on a single computer. The velocity of data refers to the speed at which data is coming in and must be processed. The variability of data refers to the wide assortment of data sources and formats to consider. Current modern flight test programs certainly strain under volume and velocity constraints. For most DoD flight test programs the bulk of the recorded data is in IRIG 106 Chapter 10 format.
+Up until recently flight test data analysis has primarily involved the analysis of single or a small number of recorded flight test data files. There are numerous applications that will read, interpret, and display recorded data from a single flight test. Cloud based computing will allow new, more sophisticated types of analysis to be done. For the first time “big data” kinds of analysis can be performed on a large number of data sets.
+Whereas up until now flight test data analysis addressed the question of how a system under test performed in the most recent flight test, cloud-based big data analytics (BDA) allow more sophisticated analysis across multiple data sets. Below are several examples of types of analytics that could be accomplished in a cloud based BDA environment.
+As we consider synthetic data it is important to keep in mind that the System Under Test (SUT) is the Big Data Analytics platform. These synthetic data sets are to support BDA development and software test.
+EXAMPLES OF BIG DATA ANALYSIS
+Nominal Flight Path Calculation
+Consider an instrument approach flown to 32 at China Lake Naval Air Weapons Station (NAWS) airport. This approach is depicted in Figure 1 below. When flying this approach it is important to pass the final approach fix KATIE at or above 4400’. Interesting analysis questions might be “what is the average altitude error and standard deviation over the final approach fix (FAF)” or “what flights were more than 3 Standard Deviations from the correct Altitude at the FAF?”
+Synthetic data with the necessary variability can be easily generated to support development of this kind of analysis.
+
+
+Figure 1 - Example flight path for approach
+Flight Segments for Analysis
+Next consider the need to identify flight paths for various test runs as shown in Figure 2 below. To measure the performance of an aircraft system under test (for example a targeting system) it is necessary to identify segments of flight test data that demonstrate performance. An interesting analysis question would be “what flight segments were flown on the test range on headings from 180 degrees to 270 degrees between 3000 and 6000 feet altitude MSL within a given latitude and longitude box?” The ability to describe flight segments of interest and then find them in a large set of recorded data files allows regression analysis over the evolution of the system.
+Carefully crafted synthetic data as shown in Figure 2 supports development of this kind of data search.
+
+
+
+
+
+Figure 2 - Example flight path segments
+Flight Segments for EW analysis
+Lastly consider the case for Radar Warning Receiver (RWR) testing as shown in Figure 3 below. RWR testing typically involves many test runs over multiple flights. To measure system performance improvements test analysis may be performed for flight test performed over a period of months or years. An interesting analysis questions would be “What flight segments were flown on a particular range between 5/1/2020 and 5/14/2020 where the RWR detected a particular radar threat?” and “What was the Average and Standard Deviation of Detection Range to the Target?”
+Synthetic data with the necessary flight paths and simulated radar threat responses can be easily generated to support development of this kind of analysis.
+
+
+
+
+Figure 3 - Example flight path segments for radar test
+Each of these example analysis scenarios described above necessitates sample data to test against. Currently developers lack realistic data sets to develop with, for two reasons:
+1) Most actual flight test data is restricted distribution in some fashion. Most of it is classified at some level but even most unclassified data is at least Controlled Unclassified Information (CUI) with limited distribution. Development teams lack people and facilities with the appropriate access to controlled data.
+2) Existing real world data sets lack “interesting” features for developers to test search and analyze algorithms. Most actual flight test data does not present good test cases for software development, test, and validation.
+Synthetic flight test data solves these problems by providing data that has unrestricted distribution and is well crafted to provide useful test cases.
+TYPES OF SYNTHETIC DATA
+In the analysis examples discussed above it is necessary to have very specific data sets to test and validate new analysis software. Because of this synthetic data is synthesized several different ways depending on the purpose of the underlying test.
+Contrived Data – This data is unrealistic flight test data but instead presents data types and values useful for testing correct decoding and conversion of IRIG 106 values. For example, a flight data file with ARINC 429 data has recently been created with integer and floating point values. Messages with minimum values, maximum values, specific positive values, specific negative values, and zero values were created to verify correct decoding.
+Synthesized Data – This data attempts to mimic realistic flight test data but with very controlled flight conditions. For example, a flight data file with aircraft navigation MIL-STD-1553 data messages derived from an aircraft simulation software program has been created. This flight data file is completely software created but realistically mimics the position, attitude, and speed of an actual test aircraft flying a typical mission on a test range with specific altitude, speed, and heading parameters.
+Repurposed Data – This data recasts previously recorded flight data into IRIG 106 format. NASA had a program to record flight data on regional commercial jets. There are data files for about 220,000 flights recorded over several years. Each flight data file records over 150 different flight parameters useful for including in derived IRIG 106 format data files for big data analytics.
+Other data sources for this effort were also considered. The FAA Automatic Dependent Surveillance–Broadcast (ADS-B) as a source for real-time actual flight data was considered but ADS-B is limited in the number of flight parameters available. Flight data from a computer based flight simulator such as X-Plane and Microsoft Flight simulator was considered but these operate in real time and would take a considerable amount of effort for a human to fly a large number of flight scenarios to support all the flight data files necessary for BDA. Lastly there are also some unclassified sources of actual flight test data but the amount of data and efficacy is limited.
+SYNTHETIC FLIGHT TEST DATA GENERATION
+Various software applications have been written for generating each of the different types of synthetic data described above. In each case there is a source of “truth” data which is then processed to generate IRIG 106 Chapter 10 data files for test.
+Contrived Data
+Contrived data is not realistic data but instead contains very specific data fields. In the case of contrived data the contents of the resultant Chapter 10 data file are specified in minute detail.
+Contrived data is generated from a content definition data file. The content definition data file contents are written by hand in XML format. Although being laborious, usually only a few well- crafted data types and fields are necessary to validate a software data decoder or processor. The IRIG 106 Chapter 10 Programming Handbook (RCC Document 123-16) Appendix P “XML Mapping” provides the information and definition of the data file contents in XML format.
+An example of a contrived dataset definition is shown in Figure 4 below. In this example ARINC 429 data messages were defined in various formats including signed and unsigned integer with minimum, maximum, and zero values.
+Once an appropriate XML content definition data file has been authored, the XML is converted into a Chapter 10 format data file using the FLIDAS software application from Data Bus Tools GmbH.
+
+Synthesized Data
+In the case of synthesized data the contents of the resultant Chapter 10 data file are derived from pre-calculated aircraft state data. The goal of the pre-calculated aircraft state data is to provide aircraft state that is realistic, deterministic, and carefully controlled. The Government Off the Shelf (GOTS) BlueMax6 simulation software available from DSIAC is used to pre-calculate realistic simulated flight data based on a provided detailed input scenario file.
+BlueMax6 calculates realistic aircraft dynamic state based on an input scenario file. This scenario file describes the desired flight path at a high level of abstraction. The aircraft type and some initial information such as initial position, heading and speed are first specified. Then the flight path is defined as a series of various types of waypoints and maneuvers, eventually ending in a landing maneuver. A portion of an example scenario file is shown in Figure 5. The flight path shown in Figure 2 was generated from a BlueMax6 scenario.
+BlueMaxRunTitle A-10 China Lake Echo Range Aircraft A-10A
+CallSign FOLK1
+EntityID 0:0:0:0
+ZuluTime 00:00:00.00
+DtedTerrain On
+InitialPitch 0
+InitialPositionLL 35.6959:N 117.6915:W InitialAltitudeMSLf 2110
+InitialTrueHeading 154.5
+InitialAirspeedKtas 50
+InitialThroPosition Auto
+InitialGearPosition Down
+OutputFileName A-10__China_Lake__Echo_Range__ OutputRateSec 0.04
+ManeuverLimits Autopilot AutopilotMaxRoll 45 AutopilotMinPitch -10 AutopilotMaxPitch +25
+CmdAltitudeMSLf 2300 CmdGearPosition 2200 CmdAirspeedMach BestRateOfClimb CmdFlapPosition Auto CmdSegmentEndMode Acquisition CmdFlySegment
+WriteMessage Low Pass Takeoff CmdTrueHeading 154.5 CmdGroundRangeNm 2 CmdAltitudeMSLf 2300 CmdThroPosition 300 CmdFlapPosition 0 CmdSlatPosition 0 CmdFlySegment
+WriteMessage China Lake Skytop CmdWaypointLL 35.700833:N 117.499167:W CmdWaypointNavMode Direct CmdAltitudeMSLf 6000
+CmdAirspeedKtas 300
+CmdFlySegment
+Figure 5 – Example BlueMax6 scenario file.
+BlueMax6 generates an output file with calculated values of aircraft state at regular time intervals. For most synthesized data runs a time step of 40 msec (50 Hz) is chosen. BlueMax6 currently has 497 different aircraft state values available for output. Besides aircraft attitude, position, velocities, and accelerations other values such as throttle position, landing position, and others are also output and used in the synthesized flight data file.
+To convert BlueMax6 output files to Chapter 10 data files several conversion software programs have been developed. Each software program is a command line console application written in C++. The current software is targeted for the Windows environment but is sufficiently generic that it could be easily ported to other operating systems such as Linux. The source code for these software programs is readily available from GitHub.
+There are two approaches to generating Chapter 10 files from BlueMax6 data. In the direct conversion approach BlueMax6 data is read and directly converted into a Chapter 10 data file. This data file includes synthesized data in MIL-STD-1553, Pulse Code Modulation (PCM), and ARINC-429 data types.
+When video is to be included in the Chapter 10 file a second conversion approach is used. When video is to be generated BlueMax6 data is first read and stored in a SQLite database. A playback application is used to read navigation data from the database, send aircraft position and attitude data to the X-Plane flight simulator application, and for each navigation point perform a screen capture. Each screen capture is then processed by the ffmpeg digital video encoder library and converted into an MPEG Transport Stream (TS) series of video packets. These TS video packets are then stored back in the SQLite database. This process is repeated for each channel of video desired. This process is depicted in Figure 6.
+Video generation is currently a very slow process. With current desktop hardware and a software- only encoder it runs at about one-half real time. For this reason video isn’t necessarily generated for synthesized data sets. From a test and software validation standpoint video data is usually of limited utility.
+Once BlueMax6 data has been stored in the SQLite database along with optional video it is processed and converted into a Chapter 10 data file. This process is depicted in Figure 7. The conversion software is a simple fixed time slice simulation engine. Data is read periodically from the SQLite database and stored in a state variable matrix, various simulation modules such as those used to generate navigation data use and add to the state variable matrix, and data formatter modules are used to synthesize and write the output Chapter 10 data.
+
+Figure 6 – Preprocessing and synthetic video generation
+
+Figure 7 – Synthetic Chapter 10 data file generation
+Repurposed Data
+In the early 2000’s NASA had a program to record and make generally available flight data from a number of commercial regional jets. Flight data was recorded onboard a single type of regional jet operating in commercial service over a three-year period. NASA makes this data available on their DASHlink website.
+The recorded data includes 186 flight parameters. Detailed aircraft dynamics, system performance, and other engineering parameters are included. Data files for over 220,000 flights were recorded and are available. Figure 8 shows a set of recorded flight paths. Figure 9 shows a set of recorded flight paths in the vicinity of Detroit’s Wayne County airport.
+Although the NASA recorded data sets aren’t carefully controlled, the large number of recorded flights flying on regular routes makes this data set useful for testing big data types of analysis.
+
+
+
+
+
+
+
+
+
+
+Figure 8 – Example of NASA recorded flights across the country
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Figure 9 – Example of NASA recorded flights near Detroit
+NASA makes these data files available in Matlab format. A python script was written to convert these Matlab format files into Comma Separated Value (CSV) format files for later processing. After conversion to CSV format, conversion to Chapter 10 format is accomplished in the same manner as conversion from BlueMax6 data previously shown in Figure 6 and Figure 7.
+CONCLUSIONS
+The DoD move to cloud computing is enabling development of Big Data Analytics capabilities. Development of new software tools and techniques will require large quantities of data and especially data with interesting features. Synthesized flight test data may be the only practical way to provide the quantities and types of data necessary for software development.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt b/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt
new file mode 100644
index 0000000..526a3ad
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt
@@ -0,0 +1,176 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2021 7th International Symposium on System and Software Reliability (ISSSR)
+Big Data-based Testing: Characteristics, Challenges, and Future Directions
+ Pan Liu Yihao Li
+Faculty of Business Information School of Information and Electrical Engineering Shanghai Business School, Shanghai, China Ludong University, Yantai, China
+panl008@163.com yihao.li@ldu.edu.cn
+Lian Zeng Xuankui Zheng Sihao Huang
+Shanghai Business School Shanghai Business School Shanghai Business School
+18786201272@163.com 1079737114@qq.com 1160114530@qq.com
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract—With the rise of the applications of the Internet of Things (IoT) in human society, how to ensure the reliability of IoT systems has become a research hotspot. Generally, there are complex interactions between multiple systems in IoT. Therefore, even if a single system can pass rigorous tests, it may not be able to guarantee that the system runs reliably in a complex IoT environment. With the operation of the IoT system, a large amount of data will be generated to record sensor data, system operations, user’s operations, and other information. Therefore, software faults or software design defects can be discovered if we use appropriate big data technology to mine the massive amount of data. The paper states the characteristics of big data-based testing and compares this test method with traditional software test methods in the software life cycle. Then, the paper discusses the challenges of applying big data-based testing to IoT systems. Finally, some future research directions of big data-based testing are given in the paper.
+Keywords: big data-based testing; big data technology; system reliability; IoT systems
+I. INTRODUCTION
+With the advent of the IoT era, more and more large- scale systems related to the national economy and people's livelihood, such as power operation system, rail transit system, and aerospace system, have been connected to the network, and software has become a key to the normal operation of IoT. However, frequent software failures have caused the problem of "trustworthy crisis" [1-3] in software. For example, due to a line of code error, the blockchain project YAM worth 500 million dollars https://news.bitcoin.com/new-defi-yield-farming-project-yam- finance-sees-460-million-locked-in-17-hours/
+2 https://www.space.com/china-far-side-moon-rover-strange- substance.html
+978-1-6654-3431-7/21/$31.00 ©2021 IEEE 44
+DOI 10.1109/ISSSR53171.2021.00012
+ was closed on August 12, 2020. Because of insufficient testing, the SpaceX rocket of the US Space Exploration Technology Company exploded when it was returned on the ground on February 2, 2021 [4]. Therefore, once the IoT system runs incorrectly or is maliciously manipulated, the consequences will be unimaginable.
+In the past, software testing is an effective way to detect software faults and improve software quality [5]. However, IoT systems often run in an extremely complex environment. Thus, it is an impossible task to test them completely. For example, due to the harsh space environment on the moon,
+China’s Yutu lunar2 rover was paralyzed on the lunar surface after less than two months of operations. This indicates that the previous software and hardware test for Yutu lunar rover was insufficient. In addition, one IoT system often has complex interactions with other IoT systems. If we stop a running IoT system and test it, it is likely to affect the normal operation of other IoT systems, resulting in huge economic losses. However, the traditional software testing methods, such as unit testing, integration testing, system testing, and acceptance testing, are difficult to effectively solve the above two problems because it is impossible to exhaustively test IoT systems. Therefore, industry and academia urgently need to study new methods of software testing to improve the quality of IoT systems.
+Recently, some scholars proposed a novel software testing method based on big data technology [6-8]. This testing method lies on the emphasis of the analysis of software running logs [9,10] or user operation data recorded by the software to detect software faults or software design defects. As the running time of the software increases, the system logs or the data recorded by the system will contain a large number of system operation information. If we regard these massive operations on the system as the software testing process, the system has already completed the massive testing, and software faults and software design defects must be recorded in the data. Therefore, these faults and defects can be detected from the data if big data mining techniques are effective. This test method is also suitable for detecting software faults and design defects of IoT systems. First of all, the IoT system will generate a large amount of data, such as sensor data, system logs, and system forum data. By mining these data, we can detect software faults and software design defects. For example, we have realized the performance test of the networking efficiency of apps and found a small number of network failure events of WeChat by analyzing its networking data [11]. Secondly, the operation of the IoT system can be optimized according to the result of data analysis. For example, Al-Ali et. al [12] improved the smart home management system through the big data analysis of the smart home, and improved the user’s experience of the smart home.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+The paper discusses big data-based testing, and compares this test method with traditional software testing methods in the software life cycle. Then, we also discuss the challenges of applying big data-based testing to ensure the reliability of IoT systems. Finally, some future research directions for big data-based testing are given to ensure the reliability of IoT systems.
+The contributions of the paper include:
+(1) We discussed the evolution of the software life cycle and the relationship between traditional software testing methods and big data-based testing. Then, we constructed four models to describe the evolution process of the software life cycle.
+(2) We summarized the three challenges of big data- based testing to ensure the reliability of IoT systems.
+(3) We presented five future research directions for big data-based testing.
+II. BIG DATA-BASED TESTING
+A. Software Life Cycle
+software release phase, software maintenance and update phase, and software obsolescence phase, as shown in Fig. 1 (a). From Fig. 1 (a), software development is accompanied by software testing in the past. If we consider iteration of software multiple versions, software life cycle can be represented by the model in Fig. 1 (b). If we consider the interaction between users and software, software life cycle can be described by the model in Fig. 1 (c). After using the software, users will put forward some suggestions for the improvement of the software according to their own habits. Programmers can update the software according to these user requirements, and then the next software version will be released. However, there are two difficulties in achieving the above process. First, not all users of software can express clearly what software requirements need to be improved. Second, users of the software may not be able to observe all software faults and software design defects. Therefore, we need to study the new and non-manually method to generate the software update requirement report.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+45
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+Generally, software life cycle [13,14] can be arbitrarily divided into software development and testing phase,
+(a) software development maintenance and software
+software Release
+and testing upgrade obsolescence
+iteration evolution
+(b) software development maintenance and software
+version Release
+and testing upgrade obsolescence
+iteration evolution
+(c) software development software upgrade software
+version Release customer use
+and testing requirement obsolescence
+iteration evolution
+(d) software development software upgrade software
+version Release customer use
+and testing requirement obsolescence
+big data fault and defect
+data collection
+analysis mining
+Figure 1. Four models for describing the evolution of the software life cycle
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+Because a large amount of data is generated from the IoT system, we can collect it and use big data technology to deal with it. Thus, it is possible to dig out software faults and software design defects from the data. We can construct a new model shown in Fig. 1 (d) to describe the software life cycle. From Fig. 1 (d), data collection, big data analysis, and data mining are used to detect software faults and software design defects so as to generate the software update report. The test method is called big data-based testing. Its core idea is to use big data technology to mine software faults and software design defects that are not found by traditional software testing methods in the software life cycle.
+Note: in practice, big data-based testing cannot replace those traditional software testing methods. Even if software faults and software design defects are detected, software testers still need to use some traditional software testing methods to fix them.
+B. Characteristics
+Compared to traditional software testing methods, big data-based testing has the following characteristics:
+(1) Big data-based testing is implemented after the software is released.
+(2) Big data-based testing does not require testers to design and execute test cases, but to detect software faults and design defects by collecting and analyzing data. Therefore, the cost of software testing is saved.
+(3) Big data-based testing is a data-driven testing method, that is, this testing method depends on the availability of the data generated by the software and the effectiveness of the data acquisition, filtering and analysis methods.
+(4) After software faults are detected by big data-based testing, the traditional software testing methods also need to be used to fix software faults and software design defects.
+(5) Big data-based testing can not only find software faults, but also detect software design defects, which is difficult to achieve by traditional software testing methods.
+C. Comparison
+The relationship between traditional software testing methods and big data-based testing is shown in Fig. 2. From Fig. 2, traditional software testing methods and big data- based testing are both part of the software life cycle. Traditional software testing methods are completed before the software is officially released, while big data-based testing is completed after the software is released. Therefore, both traditional software testing methods and big data testing realize the whole process testing of the software life cycle.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+46
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+traditional software testing methods big data-based testing
+software
+testers test cases test execution life cycle data collection data analysis
+bug fix fault and defect mining
+before software release after software release
+Figure 2. The relationship between traditional software testing methods and big data-based testing
+
+Item
Traditional software testing methods
Big data-based testing
bug fix
yes
no
software design defect
no
yes
Table 1 shows the difference between traditional software testing methods and big data-based software testing. From Table 1, traditional software testing methods are to find software bugs by executing test cases. Therefore, these test methods usually require testers to design test cases and execute test cases. Compared with traditional software testing methods, big data-based software testing requires data analysts to collect data, analyze data, and mine software faults and defects in software design. In addition, both traditional software testing methods and big data-based testing can detect software faults. Traditional software testing methods can fix software bugs, but cannot find defects in software design. Big data-based testing can detect defects in software design, but it is difficult to locate and fix software faults.
+III. CHALLENGES
+By collecting and analyzing the relevant data generated by the IoT systems, software faults and software design
+defects can be discovered. Then, we can model software behaviors to simulate the usage scenario of software that
+triggers software faults or displays software design defects. Next, exception execution paths of software are generated
+from the model using model-based testing. Finally, we can instantiate test cases of these paths to reappear software bugs
+TABLE I.
+COMPARISON OF TRADITIONAL SOFTWARE TESTING METHODS AND BIG DATA-BASED TESTING [original extraction interleaved this caption with body text] and design defects in the IoT system. To realize the above process, there are still some challenges in big data-based
+Item
Traditional software testing methods
Big data-based testing
method
execution of test cases
data collection, analysis and data mining
staff
testers
data analyst
phase in the soft. life cycle
before software release
after software release
software fault detection
yes
yes
testing.
+Challenge 1: How to analyze the data generated by the IoT systems so that valid data can be retained to realize the mining of software bugs and design defects?
+The IoT systems generate massive amount of data every day and most of the data are invalid and redundant [15], which leads to the surge of data storage cost and the difficulty of data analysis [6]. Thus, we need to construct a data filtering model to filter invalid and redundant data. Before adopting the big data analysis technologies, we cannot predict whether there are software bugs or design defects in the IoT system. So, it is an unwise choice to
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+47
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+analyze all the data directly. To solve this problem, researchers put forward the data sampling analysis method [11,12]. The main idea of the proposed method is to first select part of data from the whole data to conduct data analysis. If software faults or software design defects can be found, it indicates that the data filtering model and data analysis method are effective. Then, according to the 2-8 law, we can use the data filtering model and the data analysis method to mine all data. Otherwise, we need to redesign the data filtering model and apply a new data analysis method to deal with the data. Sampling analysis method can be applied to analyze mass data, but the difficulty of applying the method lies in choosing of the right sampling strategy and constructing of the effective data filtering model. In the future, the data sampling strategies and new data filtering models will be two research directions to realize the detection of both software faults and software design defects with the low cost of data analysis.
+Challenge 2: What kind of model can be constructed to simulate the behavioral characteristics of users using the software in a complex scenario?
+Once software faults or software design defects are found, we need to reproduce these faults and defects so that programmers can repair them. However, IoT systems are often used in a very complex application scenario, and there may also be complex interactions between users and systems. Therefore, it is a key for reproducing software faults and software design defects to construct a model to accurately describe the interaction between users and IoT systems. Generally, software behaviors include not only traditional operations such as concatenation, selection, and loop, but also operations such as synchronization, concurrency and alternation between multiple operations [3,16]. Thus, to model complex software behaviors, we need to consider the testability of the selected model so that it is easy to generate test paths from the model and instantiate test cases from test paths [17]. In the past, finite state machine (FSM [18-21]) was usually used to model software behaviors. However, because FSM does not support synchronization and concurrency operations [16], it cannot simulate all software behaviors in IoT systems. To enhance the modeling ability of FSM, extended finite state machine (EFSM [22,23]) and extended regular expression (ERE [16,24,25]) models have been proposed to model software behaviors. These models not only have more powerful modeling capabilities than FSM, but also generate test paths from the models easily. The difficulty in using EFSM and ERE models lies in the lack of modeling tools that can be used in industry. Although a few tools, such as MTTool [2], CREST [23], and SDL [26], were developed to support modeling and test generation for EFSM or ERE, these tools still have shortcomings in the multi-level modeling of large-scale complex systems.
+Challenge 3: How to quickly locate software bugs and design defects in program statements so as to assist programmers in fixing them?
+Model-based testing [21,27-29] can produce the expected execution path and expected result of the software running. Then, we can detect software faults by observing inconsistencies between the model and the actual software.
+However, this test method does not involve a single line of code. As a result, it is hard to locate software faults in the program. Combining model-based testing methods and program slicing technology [30,31] may be a way to realize the location of software faults and design defects in the future.
+IV. FUTURE DIRECTION
+Due to the difficulty of simulating the operating environment of the IoT systems exhaustively, it is hard for IoT systems to realize sufficient testing. Through the collection and analysis of data generated from the IoT system, software faults and design defects in the IoT system can be discovered. To realize this purpose, there are still some researches that need to be carried out in the future.
+a) Intent-based data collection method
+The data generated from IoT systems [32] includes: 1) the Web log on the server that records the user's various operations on the software, 2) software error information that is submitted by the user after the software crashes, 3) various operating data of the user to the software, and 4) forum data of the IoT system. Recording all the data will increase the cost of data storage, and a large amount of invalid data will also lead to the failure of big data analysis. In the past, people usually cleaned and formatted those collected big data, and then analyzed them. Therefore, the intention-based data collection method needs to be used to reduce the collected data. To realize the intention-based data collection method, we need to study the classifications of test intent. For example, to find software design defects, we should eliminate those data including standardized operations that follow the software design requirements using a data filtering model because these operations to software have been tested in traditional software testing methods. The defects in software design often come from users’ non-standard operations. Thus, the data including non-standard operations need to be collected in this test intent. In the future, different data collection methods for different test intents, including software design defects, software performance, and software application areas, will need to be studied.
+b) Analysis methods for unstructured data
+Generally, the data that records users’ use of the software are mostly unstructured data, such as log data. To analyze unstructured data, we need to perform field extraction, syntactic analysis, and semantic analysis on the collected data. Therefore, for analysis and research on unstructured data, in the future, there are the three research directions, including massive data incremental sampling analysis method, the extended regular expression modeling method of unstructured data, and the software fault mining method using extended regular expression model.
+Before using big data analysis methods to dig out software faults and software design defects, we can neither predict that the software contains faults or defects, nor predict which data mining methods that will surely detect software faults and software design defects. Aimless data analysis will lead to the increase of the data analysis cost.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+49
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+Thus, it is necessary to screen out the data that can be used to find software faults. An effective data analysis method can discover software faults with the low cost. Currently, the incremental sampling analysis method is an effective data collection strategy with the low cost. In the future, it will be necessary to study the selection strategies of the data, the conditions for terminating data selection, the analytical methods of data characteristics, and the construction method of the data filtering model.
+In the past, to extract information from unstructured data, we used the regular expression to model data features. Then, effective information can be filtered and extracted from the massive data according to this model. Although this method is very effective for the data with obvious features, it is hard for regular expressions to describe those data with complex relationship among data features. Therefore, extended regular expression needs to be studied to solve this problem in the future.
+c) Modeling tool based on regular expression
+After constructing the extended regular expression model for filtering the massive data, we also need to solve a key problem that is a supported tool for modeling extended regular expression. Currently, most of the existing data analysis tools support the processing and analysis of regular expression, but do not support the processing and analysis of extended regular expression. In the future, the modeling theory of extended regular expression and the conversion rules from the model to test paths need to be studied. The difficulty of this research is how to ensure the validity of the transformation from the extended regular expression model to a group of sub regular expression models.
+d) Software behavior modeling
+In the past, to simulate software behaviors, researchers usually need to build models such as FSM, label transition system, and Petri net [32]. However, the relationship between software behaviors in the Internet of things is very complex, such as concurrency and synchronization, which leads to the modeling failure of FSM and label transition system. To model software behaviors in the IoT, it is necessary to clarify the interaction between users and software, such as whether the concurrent operation is between users, how the server responds to these operations, whether the user operation meets the business process and so on.
+e) Software fault location combining model-based testing and program slicing technique
+Through data mining, software faults or software design defects can be found. Then, we can get execution paths using model-based testing for reproducing software faults and design defects in the IoT system. To help programmers fix software faults and design defects, we also need to locate software faults in the program. In the past, programmers usually used program slicing techniques to locate software faults. Therefore, how to combine model-based testing and program slicing techniques to find software faults is one of the future research directions.
+V. CONCLUSION
+Generally, the IoT system runs in a very complex environment, so it is difficult to realize the complete test of the IoT system in traditional software methods. As a result, it is hard to ensure the reliability of the IoT system by using the way of software testing. To improve the reliability of the IoT system, we recommend big data-based testing. Because the IoT system will produce a large amount of data, including system operation data, user interaction data, sensor data, etc., we can detect potential software faults or software design defects by mining these data. Currently, there are a number of online data sources3,4,5 available to realize software defect detection. This paper discusses the characteristics of big data-based testing, and compares this method with traditional software testing methods. Then, this paper presents the current challenges of big data-based testing, and gives the future research directions of this method. The work in this paper has a very important reference for the promotion and application of big data-based testing.
+REFERENCES
+[1] V. V. G. Neto, "A model-based approach towards the building of trustworthy software-intensive systems-of-systems," in 2017 IEEE/ACM 39th International Conference on Software Engineering Companion (ICSE-C), 2017, pp. 425-428.
+[2] P. Liu and Z. Xu, "MTTool: A Tool for Software Modeling and Test Generation," IEEE Access, vol. 6, pp. 56222-56237, 2018.
+[3] X. Cheng, Y. Wang, W. Zhou, X. Wang, and J. Wang, “Software fault detection for sequencing constraint defects,” International Journal of Performability Engineering, vol. 16, no. 11, pp. 1814–1825, November 2020.
+[4] L. Dawson, "Technological Risks of Space Flights and Human Casualties," in The Politics and Perils of Space Exploration, ed: Springer, 2021, pp. 225-241.
+[5] S. Masuda, K. Ono, T. Yasue, and N. Hosokawa, "A survey of software quality for machine learning applications," in 2018 IEEE International conference on software testing, verification and validation workshops (ICSTW), 2018, pp. 279-284.
+[6] A. Miranskyy, A. Hamou-Lhadj, E. Cialini, and A. Larsson, "Operational-log analysis for big data systems: Challenges and solutions," IEEE Software, vol. 33, pp. 52-59, 2016.
+[7] J.-G. Lou, Q. Fu, S. Yang, Y. Xu, and J. Li, "Mining Invariants from Console Logs for System Problem Detection," in USENIX Annual Technical Conference, 2010, pp. 1-14.
+[8] X. Zhang, Y. Xu, Q. Lin, B. Qiao, H. Zhang, Y. Dang, C. Xie, X. Yang, Q. Cheng, and Z. Li, "Robust log-based anomaly detection on unstable log data," in Proceedings of the 2019 27th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering, 2019, pp. 807-817.
+[9] R. Abbas, Z. Sultan, and S. N. Bhatti, "Comparative analysis of automated load testing tools: Apache jmeter, microsoft visual studio (tfs), loadrunner, siege," in 2017 International Conference on Communication Technologies (ComTech), 2017, pp. 39-44.
+[10] Y.-J. Chen and H.-Y. Chien, "IoT-based green house system with splunk data analysis," in 2017 IEEE 8th International Conference on Awareness Science and Technology (iCAST), 2017, pp. 260-263.
+[11] P. Liu, "Big Data Testing Technology: data collection, analysis, and test practice," Posts and Telecom Press, 2018. (in Chinese)
+3 https://academic.oup.com/nar/article/46/D1/D14/4316108 4 https://sir.csc.ncsu.edu/portal/index.php
+5 https://www.kaggle.com/
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+50
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
+
+[12] X. Wu, X. Zhu, G.-Q. Wu, and W. Ding, "Data mining with big data," IEEE transactions on knowledge and data engineering, vol. 26, pp. 97-107, 2014.
+[13] V. T. Rajlich and K. H. Bennett, "A staged model for the software life cycle," Computer, vol. 33, pp. 66-71, 2000.
+[14] T. R. D. Saputri and S.-W. Lee, "Integrated framework for incorporating sustainability design in software engineering life-cycle: An empirical study," Information and Software Technology, vol. 129,
+p. 106407, 2021.
+[15] M. Gudipati, S. Rao, N. D. Mohan, and N. K. Gajja, "Big data: Testing approach to overcome quality challenges," Big Data: Challenges and Opportunities, vol. 11, pp. 65-72, 2013.
+[16] P. Liu and H. Miao, "Theory of Test Modeling Based on Regular Expressions," in Structured Object-Oriented Formal Language and Method, ed: Springer, 2014, pp. 17-31.
+[17] P. Liu, H.-K. Miao, H.-W. Zeng, and Y. Liu, "FSM-based testing: Theory, method and evaluation," Jisuanji Xuebao(Chinese Journal of Computers), vol. 34, pp. 965-984, 2011.
+[18] A. A. Andrews, J. Offutt, and R. T. Alexander, "Testing Web applications by modeling with FSMs," Software & Systems Modeling, vol. 4, pp. 326-345, 2005.
+[19] W. Li, F. L. Gall, and N. Spaseski, "A Survey on Model-Based Testing Tools for Test Case Generation," in International Conference on Tools and Methods for Program Analysis, 2017, pp. 77-89.
+[20] C. Gaston and D. Seifert, "Model-Based Testing of Reactive Systems. Advanced Lectures, chapter Evaluating coverage based testing," ed: Springer-Verlag, Berlin, 2005.
+[21] P. Liu, Y. Li, and Z. Li, "Some Thoughts on Model-Based Test Optimization," in 2019 IEEE 19th International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2019, pp. 268-274.
+[22] Y. Chen, A. Wang, J. Wang, L. Liu, Y. Song, and Q. Ha, "Automatic Test Transition Paths Generation Approach from EFSM Using State Tree," in 2018 IEEE International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2018, pp. 87-93.
+[23] K. Androutsopoulos, N. Gold, M. Harman, Z. Li, and L. Tratt, "A theoretical and empirical study of EFSM dependence," in 2009 IEEE
+International Conference on Software Maintenance, 2009, pp. 287- 296.
+[24] P. Liu, J. Ai, and Z. J. Xu, "A study for extended regular expression- based testing," in Computer and Information Science (ICIS), 2017 IEEE/ACIS 16th International Conference on, 2017, pp. 821-826.
+[25] O. Kilinccceker, E. Turk, M. Challenger, and F. Belli, "Regular Expression Based Test Sequence Generation for HDL Program Validation," in 2018 IEEE International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2018, pp. 585- 592.
+[26] W. E. Wong, T. Sugeta, J. J. Li, and J. C. Maldonado, "Coverage testing software architectural design in SDL," Computer Networks, vol. 42, pp. 359-374, 2003.
+[27] F. Abbors, T. Ahmad, D. Truscan, and I. Porres, "MBPeT: a model- based performance testing tool," in 2012 Fourth International Conference on Advances in System Testing and Validation Lifecycle, 2012.
+[28] A. Aerts, M. R. Mousavi, and M. Reniers, "A Tool Prototype for Model-Based Testing of Cyber-Physical Systems," vol. 9399, pp. 563-572, 2015.
+[29] M. Markthaler, S. Kriebel, K. S. Salman, T. Greifenberg, S. Hillemacher, B. Rumpe, C. Schulze, A. Wortmann, P. Orth, and J. Richenhagen, "Improving model-based testing in automotive software engineering," in 2018 IEEE/ACM 40th International Conference on Software Engineering: Software Engineering in Practice Track (ICSE-SEIP), 2018, pp. 172-180.
+[30] N. AlAbwaini, A. Aldaaje, T. Jaber, M. Abdallah, and A. Tamimi, "Using Program Slicing to Detect the Dead Code," in 2018 8th International Conference on Computer Science and Information Technology (CSIT), 2018, pp. 230-233.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+51
+Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt b/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt
new file mode 100644
index 0000000..59ee7cf
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt
@@ -0,0 +1,131 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+© 2022 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works.
+SIM-PIPE DryRunner: An approach for testing container-based big data pipelines and generating simulation data
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Aleena Thomas SINTEF AS
+Oslo, Norway
+Aleena.Thomas@sintef.no
+Dumitru Roman SINTEF AS Oslo, Norway
+Dumitru.Roman@sintef.no
+Nikolay Nikolov SINTEF AS
+Oslo, Norway
+Nikolay.Nikolov@sintef.no
+Brian Elvesæter SINTEF AS
+Oslo, Norway
+Brian.Elvesater@sintef.no
+Antoine Pultier SINTEF AS
+Oslo, Norway
+Antoine.Pultier@sintef.no
+Ahmet Soylu
+Oslo Metropolitan University Oslo, Norway
+Ahmet.Soylu@oslomet.no
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Abstract—Big data pipelines are becoming increasingly vital in a wide range of data intensive application domains such as digital healthcare, telecommunication, and manufacturing for efficiently processing data. Data pipelines in such domains are complex and dynamic and involve a number of data processing steps that are deployed on heterogeneous computing resources under the realm of the Edge-Cloud paradigm. The processes of testing and simulating big data pipelines on heterogeneous resources need to be able to accurately represent this complexity. However, since big data processing is heavily resource-intensive, it makes testing and simulation based on historical execution data impractical. In this paper, we introduce the SIM-PIPE DryRunner approach – a dry run approach that deploys a big data pipeline step by step in an isolated environment and executes it with sample data; this approach could be used for testing big data pipelines and realising practical simulations using existing simulators.
+Index Terms—Big data pipelines; Dry run; Software contain- ers; Sandbox; Testing; Simulation
+I. INTRODUCTION
+The need for supporting big data pipeline processing is increasing rapidly with more and more applications running on the Cloud and large IoT systems handling huge volumes of data [1]. Big data pipelines are designed to handle large amounts of streaming and batch processing data and are be- coming indispensable in a wide variety of application domains
+[2]. One of the main challenges in managing big data pipelines is analyzing the behaviour of different pipeline steps in order to deploy them in a cost-effective manner. Since deploying computing resources for these pipelines is expensive, it is crucial to adjust the deployment parameters for optimized ex- ecution and to ensure only required resources are provisioned
+[3]. Therefore, one of the key aspects of the big data pipeline lifecycle relates to testing and simulation before deployment in a production setting [4]. Testing refers to executing steps in a pipeline according to its definition, whereas simulation focuses on estimating the performance of the pipeline in the actual
+computing infrastructure by predicting the performance of the pipeline given the execution parameters. An efficient mean of testing and simulating pipelines before deployment allows identifying errors and bottlenecks early and addressing them before provisioning expensive computing resources in the actual production environment on the Cloud-Edge continuum. There are multiple simulation solutions for big data pipelines (e.g., [5]–[7]). One of the main challenges with the simulators is that most of the existing approaches rely on results from previous runs of pipelines or analyses by an expert in order to make predictions [4]. In the case of big data, predicting performance using previous runs is likely to result in high costs if the pipeline is highly computing-intensive. Big data pipelines are complex and dynamic processes built to run on top of a multitude of heterogeneous services and computing resources, which makes prediction of their performance a challenge [2]. To this end, we propose an approach—SIM- PIPE DryRunner—based on dry running of big data pipelines. We describe dry running of big data pipelines as the execution of a pipeline using a sample or smaller input data size (compared to the full-scale big data) on a test environment as opposed to using the infrastructure for production deployment. The overall approach is depicted in Figure 1. We assume that the resource usage metrics for the dry run of the pipeline on a representative set of small input data can be used in the analysis of its behaviour for large amounts of input data. The proposed approach deploys each step in the correct order in an isolated testing environment, hereafter called a sandbox. We use an isolated environment (e.g., a virtual machine) for the dry run, since it can reduce interference from other running applications and ensures better estimates of the performance for the pipelines. 
The approach enables one to run the pipeline and analyze it in a lower cost environment than simulators, which do additional processing to simulate the actual computing environment like the Cloud or Edge
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+This is the author accepted version of an article published in
+2022 IEEE 46th Annual Computers, Software, and Applications Conference (COMPSAC) https://doi.org/10.1109/COMPSAC54236.2022.00182
+
+Fig. 1. Dry run approach for testing and simulating big data pipelines.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+where it will be deployed in production. The approach, firstly, could be used to check the correctness of the pipeline and to ensure that the pipeline is working as expected and producing the expected output. Secondly, dry run results can be used in simulators to aid in predicting the performance of the pipeline and identify possible bottlenecks. Thereby, the dry run result of the pipeline for a small data size may be used to predict the performance for bigger data sizes, assuming that the data are processed in chunks/slices. For example, metrics collected by dry running with different chunk sizes can be used to estimate infrastructure resources required for scaling the pipeline (e.g, CPU, memory and disk size, and using multiple processes). Software container technologies could simplify the execution of data pipelines [8] both in isolated and production envi- ronments by encapsulating individual data pipeline steps in platform and programming language independent containers. In this paper, we describe the proposed dry run approach and present a tool—the SIM-PIPE DryRunner tool—implementing the approach. The overall SIM-PIPE solution aims at using the dry run results for testing the pipelines and simulating them using existing simulators.
+The rest of the paper is organized as follows. Section II provides the description of our approach as well as the technical architecture and implementation. In Section III, we present a use case for the proposed approach, while Section IV presents related work. In Section V, we summarize our approach and provide directions for future work.
+II. SIM-PIPE DRYRUNNER APPROACH
+The proposed approach based on dry running of big data pipelines relies on the use of an isolated sandbox environment to execute pipeline steps. By maintaining an isolated testing environment, we are able to get an estimate of the resource usage of each step without interference from other running processes. Moreover, the container-based implementation of the step facilitates accurate estimation of its total execution time in the actual deployment infrastructure. This is due to the homogeneity of container technologies, which ensures that the execution of the container is reproducible regardless of the computing infrastructure in which it is executed. Thus, by running the container-based implementations of the pipeline steps, we ensure that we obtain values from dry run, which
+can be used to predict how the pipeline behaves on resources on the Cloud-Edge continuum.
+Figure 2 shows the main steps of the dry run process. Once a dry run is initiated, a step in the pipeline and sample data are deployed to the sandbox using a container. During the execution of the step, execution time will be recorded and the sandbox will be continuously polled for metrics about the execution. These metrics are stored for later use. Once the step has successfully performed the data processing task, the resulting data will be retrieved, the running step will be removed from the sandbox, and the same process will be repeated for the next steps (i.e., deploy the step and feed it with the resulting data from the previous one). Based on the data gathered, analytics will be performed to derive results that apply to the entire pipeline. The pipeline steps, in case of steps performing batch processing, are provided with a sample input to be used during the dry run. In case of steps which perform continuous processing, there is a user-defined option to provide the number of seconds to wait before the step is terminated; this ensures that the correctness of the step and recording of resource usage metrics can be done for that specified amount of time. All the details including resource usage statistics, inputs to the steps, and outputs of the execution are stored and eventually used to perform resource usage analytics.
+In the following we describe the technical architecture and implementation of the SIM-PIPE DryRunner tool, and outline
+a typical use of the tool.
+A. Technical Architecture and Implementation
+In order to demonstrate the feasibility of the approach for dry running of big data pipelines, we designed and imple- mented a prototype application—the SIM-PIPE DryRunner tool. It consists of several components that are deployed sepa- rately in order to ensure an appropriate execution environment for the dry run approach. The current version of the tool, along with installation instructions are available on GitHub1.
+Figure 3 shows the deployment topology and architecture for SIM-PIPE DryRunner tool. The tool is designed to be de- ployed in two separate hosts: one for hosting the front-end and business logic, and one for hosting the sandbox environment. The main component is the dry run controller, which performs a step-wise analysis of the pipeline by deploying steps and
+1https://github.com/DataCloud-project/SIM-PIPE
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+
+Fig. 2. The SIM-PIPE DryRunner process for testing and collecting performance data.
+
+Fig. 3. SIM-PIPE DryRunner tool: deployment topology and architecture.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+collecting relevant data. Host 1 in Figure 2 contains the dry run controller and REST service (which serves the front-end of the implementation) as well as the dry run data storage, which is implemented using TimescaleDB2. In our implementation, these sub-components are deployed on the host using Docker containers. The necessary files for providing the input and storing the output of each step are transmitted and stored using an SFTP server which also runs in a Docker container in host 2. When deploying a step to be analyzed, the dry run controller sends (if needed) data over SFTP to the sandbox host, which makes it available to the container and executes the step.
+The dry run controller and REST service are implemented using NodeJS3 and use a number of NodeJS libraries related to
+2https://www.timescale.com 3https://nodejs.org
+managing the execution of containers on a target host, namely dockerode4 for container execution control in the sandbox and ssh2-sftp-client5 for interacting with the SFTP server on the sandbox. The REST API is developed using GraphQL6 (a query language for APIs). Hasura7 is used to develop and
+connect to the data model of the dry run data storage. The front-end of the SIM-PIPE DryRunner tool is implemented using Appsmith8.
+The current version of the SIM-PIPE DryRunner tool user interface is depicted in Figure 4. The interface displays a list of
+4https://github.com/apocas/dockerode 5https://github.com/theophilusx/ssh2-sftp-client 6https://graphql.org
+7https://hasura.io
+8https://www.appsmith.com
+dry runs tied with a specific pipeline as well as the associated runs to each dry run. For each run, it displays the run state (“Waiting”, “Queued”, “Active”, “Completed”, “Failed”, or “Cancelled”) as well as statistics on each of the steps. The statistics include the used CPU, memory, network, and running time. In addition to the statistics, the current version of the user interface displays logs from the execution of the steps. The tool assumes that the pipeline description is provided in the form of a Domain Specific Language (DSL) which is described in a Github repository9. This DSL has been developed as part of the DEF-PIPE tool which is a GUI (Graphical user Interface) based tool to design, implement and store big data pipelines. More details and usage guidelines of this tool are given in a Github repository10.
+The current implementation supports explicitly step imple- mentations as described in the big data pipeline approach in [9], whereby each container collects input data, stores output data, and any intermediate data separately in a file system. Thereby, the SIM-PIPE DryRunner tool provides input data to the steps and stores intermediate step outputs for analysing the dry run. Other step implementations that do not use file-based data transmission are also applicable, but the data delivery system currently does not support this.
+The dry run data storage uses a relational database model and records each dry run with a timestamp and pipeline identifier. Each run is also associated with the DSL model that was used when the run was started as well as its (current) status and the timestamps when the run was created, started, and ended. Each run stores data for each of the steps that are in the input DSL model with the step name, status, and metrics about the used CPU and memory. Intermediate data are stored on disk in a file system that are marked with the pipeline identifier, run identifier, and step number and can be served on request to the front-end.
+B. Using the SIM-PIPE DryRunner tool
+Dry run using the SIM-PIPE DryRunner tool is done through the following steps:
+• First, the user creates a new dry run for a pipeline by providing its DSL description and sample input data using the SIM-PIPE DryRunner tool UI.
+• The user starts a new dry run and the current status of the run and each step is displayed in the UI.
+• After each step has completed execution indicated by its status, the user can click on the step to view the logs generated during execution, CPU usage percentage, network usage, memory usage and maximum memory usage over time.
+• In case of failure of a step, the status of the step and correspondingly run would indicate failure status, and only the logs would be displayed which may help in debugging.
+9https://github.com/DataCloud-project/DEF-PIPE-DSL 10https://github.com/DataCloud-project/DEF-PIPE
+• The step can also be stopped while running, and this stops the current step and all the succeeding steps in the pipeline.
+III. USE CASE
+The SIM-PIPE DryRunner tool was tested on data pipelines in the context of a digital health system, where developers and data engineers are using data pipelines to implement different e-health services. The main objective of the digital health sys- tem is to monitor, support and help patients, especially elderly, at their homes, remotely. The system uses data pipelines to gather sensor data (e.g., welfare sensors and medical devices) from the patients, store and process the patient data, and provide relevant data to the right stakeholder at the right time (e.g., notifications of events to healthcare providers, storing data in electronic health records, and providing data and notifications to third party health systems).
+Figure 5 illustrates a generic digital health data pipeline that involves three steps: 1) Data generation, pre-processing and routing, 2) Data storage and analysis, and 3) End user application logic. The first step is deployed on the Edge, while the two latter are deployed on the Cloud. The steps are the same three steps shown in the SIM-PIPE DryRunner tool UI in Figure 4. The first step involves collecting and formatting sensor data from healthcare sensors and medical devices that the patient uses. The second step involves storing the data and checking it against the patient plan. The third step involves different types of end user application logic, such as notifying healthcare providers and submitting reports to 3rd party healthcare systems.
+Several instances and variants of data pipelines are deployed in the digital health use case. There are pipeline instances for each patient. Some of the challenges in managing the various variants of pipelines relates to i) scaling individual steps of the pipeline, ii) the need to build new applications for each new type of sensor, and iii) finding the optimal resource allocation for data processing steps. The SIM-PIPE DryRunner tool is used to address these challenges, allowing the developers and data engineers of the digital health data pipelines to test new variants of the pipelines without deployment on production infrastructure in order to identify trouble spots and bottlenecks early, as well as better understand the resource requirements required from the metrics collected by the SIM- PIPE DryRunner tool.
+IV. RELATED WORK
+There are several simulation approaches for data pipelines that include tools to simulate big data pipelines, such as the event-based simulator GroudSim [5], and process-based simulators GridSim [6] and CloudSim [7]. Despite the number of simulation approaches in literature, there are few that can be used for testing and simulation of big data pipelines. Liu et al. [10] present a survey of scientific workflow management systems in the context of big data pipelines, out of the five
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+
+Fig. 4. SIM-PIPE DryRunner tool front-end.
+
+Fig. 5. Generic digital health data pipeline.
+systems presented only two of them (Taverna (https://incubator.apache.org/projects/taverna.html), Swift (https://github.com/square/workflow-swift)) had a simulation or testing component. While Taverna is specialized to support bio-informatics pipelines, Swift only provides tools for unit and integration testing of pipelines. These simulators vary in ways in which they accept data for simulating a pipeline. Many of them run pipelines multiple times and the results from the runs are used in simulation [11].
+Iatropoulou et al. [12] present a data pipeline management system for container-based big data pipelines that supports design, composition, configuration, orchestration, enactment, and validation of end-to-end big data analytic services. Each step in the input pipeline is provided in the form of one of the four predefined containerized application images (named as Apps) which is part of their microservices architecture. Though it handles several types of big data workflows, it is not open source and thus cannot be extended.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+V. CONCLUSIONS AND OUTLOOK
+We proposed a new approach—SIM-PIPE DryRunner—for dry running of big data pipelines using an isolated sandbox for deployment of steps. Testing and simulation of big data pipelines is challenging, since the existing methods depend on information from previous runs or domain expert knowledge, which are difficult to acquire in case of big data pipelines. We also developed an initial version of the tool—the SIM-PIPE DryRunner tool—with a user interface in which the pipeline designer can input and dry run big data pipelines and view the results of the resource usage of step execution and logs. The dry run results of the big data pipeline can be used in existing simulators by bringing them into the respective format that can be used as input. One limitation of this method is that it assumes that the big data pipelines have container-based implementations.
+In the future, we aim to enable the SIM-PIPE DryRunner tool to recommend minimum requirements for the resources necessary to run the pipeline steps successfully (i.e., the minimum memory and CPU requirements) and to provide an estimation of the optimal horizontal scaling for each individual step that will allow for executing the pipeline without bottlenecks. Future work also involves extending it further by integrating advanced analytics for the results obtained from the sandbox. This involves predicting the resource usage performance and total execution time of the pipeline when a given input size is specified. We also aim to analyze and quantify the impact of parallelisms for various pipeline steps. This can be used in configuring the resources at deployment or in scheduling algorithms. Finally, we also plan to use the dry run results in existing simulators. This requires investigation of input formats which is accepted by these simulators and conversion of the output of our tool into a format that is usable by them.
+Acknowledgements. This work received partial funding from the European Commission Horizon 2020 DataCloud project (grant number 101016835), the NFR BigDataMine project (grant number 309691), and the SINTEF internally funded SEP DataPipes project.
+REFERENCES
+[1] R. Buyya, S. N. Srirama, G. Casale, R. Calheiros, Y. Simmhan,
+B. Varghese, E. Gelenbe, B. Javadi, L. M. Vaquero, M. A. S. Netto,
+A. N. Toosi, M. A. Rodriguez, I. M. Llorente, S. D. C. D. Vimercati,
+P. Samarati, D. Milojicic, C. Varela, R. Bahsoon, M. D. D. Assuncao,
+O. Rana, W. Zhou, H. Jin, W. Gentzsch, A. Y. Zomaya, and H. Shen, “A manifesto for future generation cloud computing: Research directions for the next decade,” ACM Computing Surveys, vol. 51, no. 5, 2018.
+[2] M. Barika, S. Garg, A. Y. Zomaya, L. Wang, A. V. Moorsel, and
+R. Ranjan, “Orchestrating big data analysis workflows in the cloud: Research challenges, survey, and future directions,” ACM Computing Surveys, vol. 52, no. 5, 2019.
+[3] A. Shakarami, H. Shakarami, M. Ghobaei-Arani, E. Nikougoftar, and
+R. Faraji-Mehmandar, “Resource provisioning in edge/fog computing: A comprehensive and systematic review,” Journal of Systems Architecture, vol. 122, p. 102362, 2022.
+[4] I. Bambrik, “A survey on cloud computing simulation and modeling,” SN Computer Science, vol. 1, no. 5, p. 249, 2020.
+[5] S. Ostermann, K. Plankensteiner, R. Prodan, and T. Fahringer, “Groudsim: An event-based simulation framework for computational grids and clouds,” in Proceedings of the Euro-Par Parallel Processing Workshops (Euro-Par 2020), ser. LNCS, vol. 6586. Springer, 2010, pp. 305–313.
+[6] R. Buyya and M. Murshed, “Gridsim: A toolkit for the modeling and simulation of distributed resource management and scheduling for grid computing,” Concurrency and computation: practice and experience , vol. 14, no. 13-15, pp. 1175–1220, 2002.
+[7] R. N. Calheiros, R. Ranjan, A. Beloglazov, C. A. De Rose, and R. Buyya, “Cloudsim: a toolkit for modeling and simulation of cloud computing environments and evaluation of resource provisioning algorithms,” Soft- ware: Practice and experience, vol. 41, no. 1, pp. 23–50, 2011.
+[8] M. Matskin, S. Tahmasebi, A. Layegh, A. H. Payberah, A. Thomas,
+R. Nikolov, and D. Roman, “A survey of big data pipeline orchestration tools from the perspective of the datacloud project,” vol. 3036, 2021.
+[9] N. Nikolov, Y. D. Dessalk, A. Q. Khan, A. Soylu, M. Matskin, A. H. Payberah, and D. Roman, “Conceptualization and scalable execution of big data workflows using domain-specific languages and software containers,” Internet of Things, vol. 16, p. 100440, 2021.
+[10] J. Liu, S. Lu, and D. Che, “A survey of modern scientific workflow scheduling algorithms and systems in the era of big data,” in Proceedings of the IEEE International Conference on Services Computing (SCC 2020). IEEE, 2020, pp. 132–141.
+[11] T.-P. Pham, J. J. Durillo, and T. Fahringer, “Predicting workflow task execution time in the cloud using a two-stage machine learning approach,” IEEE Transactions on Cloud Computing, vol. 8, no. 1, pp. 256–268, 2017.
+[12] S. Iatropoulou, P. Petrou, S. Karagiorgou, and D. Alexandrou, “Towards platform-agnostic and autonomous orchestration of big data services,” in Proceedings of the IEEE Seventh International Conference on Big Data Computing Service and Applications (BigDataService 2021). IEEE, 2021, pp. 1–8.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt b/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt
new file mode 100644
index 0000000..a47daf1
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt
@@ -0,0 +1,58 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt b/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt
new file mode 100644
index 0000000..6945d9f
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt
@@ -0,0 +1,141 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Analysis on the Quality Model of Big Data Software
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+Xijiao Xu
+Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center
+Shanghai, China xxj@sscenter.sh.cn
+ Jiayu Gong
+Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center
+Shanghai, China gjy@sscenter.sh.cn
+ Huanming He
+Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center
+Shanghai, China hhm@sscenter.sh.cn
+Wei Song
+Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center
+Shanghai, China songw@sscenter.sh.cn
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+Abstract—With the rapid development of the big data system, The big data system has the characteristics of large data scale, diverse data and high computational complexity. Its testing method has to be constantly improved. By analyzing the general software quality model, and combining the characteristics of the big data software, a set of quality model for the big data software is formed.
+Keywords—Big Data ,the Quality Requirements ,Software Model
+I. INTRODUCTION
+The rapid development of the Internet has given birth to a large number of new frontier technologies. The big data is a hot emerging industry in recent years. The Internet has created a large-scale application environment for the big data technology, which first originated from the Internet. The Internet provides the most important data foundation for the big data. The analyzing and processing capabilities of the big data also bring more developing possibilities for the Internet
+companies. In this article, the big data system is defined as a system that centrally stores big data resources; meets high-concurrency and mass-data requirements for high-performance computing and large-capacity storage; and provides a broad set of open capabilities such as data collection, data calculation, data storage, data analysis, and data visualization.
+As a new application technology, the big data system carries the core business of the platform frequently, so the comprehensive testing and evaluating of the big data system is particularly important. However, due to the characteristics of the big data, its testing methods are different from the traditional software test. The evaluated model of the general software quality ,which is used in the big data system, cannot reflect the characteristics of the big data system such as large data scale, diverse data, high computational complexity, and
+distributed structure. This paper will establish a set of software quality model for the big data system to provide reference for the test and evaluation of the big data system, from the perspective of software quality evaluation model and combining with the big data system evaluated examples.
+II. THE EVALUATED MODEL OF THE SOFTWARE PRODUCT QUALITY MODEL
+Software products have different quality requirements from the perspective of different users. Users consider that the software is easy to use, easy to learn, flexible and user-friendly as the high-quality software. Product managers consider that the software is easy to maintaining, easy to modifying, and easy to developing because of thinking about the product marketing competitiveness. Developers usually consider the software’s complexity and importance as the important indicators of the software quality. So it has great significance to establishing the software quality standard, which is beneficial to improving the product’s software quality.
+At present, the general software quality standards widely used and recognized in the industry are ISO/IEC 25023:2016[1~2]. The software products’ quality evaluated model includes ISO/IEC 25051 software quality model[3]. In this model, the software quality characteristics are defined as functional suitability, performance efficiency, compatibility, usability, reliability, security, maintain-ability and portability. These quality characteristics can be used as the general software quality metrics, but the quality of the big data system cannot be measured.
+The difference between the big data systems and the traditional systems is storage, mainly about the database storage and the file storage. The search engine companies were the first to feel the technical challenges of the massive amounts of data. Subsequently, the rise of the social media sites and the mobile Internet aggravated this challenge. The Internet companies find that the growth, the diversity, and the processing timeliness requirements of the new data cannot be dealt with by the traditional databases and business intelligent vertical scaling architectures. Because the traditional database is designed to capture data, if you directly get data from it for analysis, there will be many problems, such as complex
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+ This work was supported by National Key R&D Program of China (No. 2018YFB1403404).
+978-1-6654-1893--5/21/$31.00 ©2021 IEEE 78
+ICIS 2021-summer, June 23-25, 2021, Shanghai, China
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+structure, messy data, missing history, slow query when the amount of data is large, etc. At this time, you need a "data warehouse ". As a result, the distributed file system—— Google File System (GFS) was first proposed, the distributed computing system and the distributed database solved the predicament faced by the big data with the lower cost and laid the foundation for the flourishing of big data technologies such as HBase, Cassandra, MongoDB, Neo4j and Redis and other databases. The computing processing engine gradually covers scenarios such as offline batch computing, real-time computing, stream computing, and the computing frameworks of MapReduce, Spark, Flink, and Storm are born. In the field of data query and analysis, it has formed a wealth of SQL on Hadoop solutions, massively parallel processing (MPP) architecture, Hive, HDFS, MR, TeraData, GreenPlum and other technologies. The universal system frame diagram of applying big data technology is shown in Figure 1, which contains the common components of the big data system.
+
+Fig. 1. The system frame diagram for Big Data System
+Therefore, according to the characteristics of the big data system, it is necessary to provide more quality measures for its software quality model, and comply with the following principles[4]:
+1) Performance efficiency should consider the processing speed, the response time, the resource consumption, throughput, etc. The general performance testing tools are not suitable for the big data system’s measurement, and there are many types of modules in the big data system, also the different modules require the different testing techniques, so multiple testing tools are frequently needed.
+2) The testing environment and monitoring plan of the big data system should be considered. The testing environment of the big data system is complex, and
+the factors that affect the performance of the big data system are numerous and complicated, including network environment, application, virtualization, data quality, etc., so it is necessary to monitor the entire Cluster machines, services, computing, storage, tasks and other information.
+3) The measurability of the quality characteristics should be considered. It should be measured by subjective and objective means, and the cost of measurement should be taken into account. It should be easy to measure and convenient for data collection. The data processed by the big data system has the characteristics of large-scale (Volume), various types (Variety), and fast production speed (Velocity). In the test process of the big data system, the more realistic the test data set is, the more reliable the test results will be.
+III. THE EVALUATED MODEL OF THE BIG DATA SOFTWARE QUALITY
+Based on the above evaluation principles, and combined with the ISO/IEC 25051 software quality model, a three-tier structure framework is formulated for the test quality evaluated model of the big data system, as shown in Figure 2. In this framework model, the quality factor layer is the eight quality characteristics of the software quality model; the quality sub- elements are the refinement of its upper quality factor layer; the bottom layer is the software quality metric (including various parameters), which is a quantitative software characteristic indicators. For example, the resource consumption mentioned in the article is the software quality metric of resource availability which is attributed to performance efficiency.
+
+
+
+Metric Metric Metric Metric Metric Metric
+Fig. 2. Quality Evaluated Model
+A. Functional Suitability
+The functional sub-characteristics of the big data system mainly include data collection, data storage, data analysis, etc. For the big data system, it mainly measures its data analysis and processing function modules, namely data tables or data files. The specific measurement elements include[5-7]:
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+79
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+(1) Verify that the data table is complete, and the table name is consistent with the agreement;
+(2) Verify that the data table fields are complete, and the field name, field type, length precision and other attributes are consistent with the convention;
+(3) Verify that the primary key of the data table is set consistent with the agreement, and the technical constraints are that there are no records with duplicate primary keys and no records with null primary key fields;
+(4) Verify that the time constraint is consistent with the convention.
+data processed by each Executor and the processing time can be viewed by accessing Spark's Web UI interface. The Spark's Web UI interface is shown in Figure 3.
+
+Fig. 3. The Spark's Web UI Interface
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+B. Performance Efficiency
+The sub-characteristics of performance efficiency mainly verify the platform components of the big data system, including HDFS, HBASE, SPARK, Cloudera and so on. Under each sub-characteristic, the performance testing elements of the big data system mainly include: throughput, data processing, query response time, etc. The components and metrics are shown in Table 1.
+C. Compatibility
+Compatibility mainly includes co-existence, interoperability and other aspects. Among them, interoperability is to evaluate the ability of information transfer and interaction between two or more modules. In the big data system framework, data providers introduce new data or information into the big data system; data consumers use applications provided by the big data application providers. There are rich interfaces among the data providers, the data consumers and the big data application providers, such as the data access interface, the data acquisition interface, the data
+Table 1 Components and Metrics
+Components
Metrics
HDFS
Throughput (Read and Write Performance)
HBASE
Data processing(Read and Write Requests/per second)
SPARK
Data processing
Cloudera
The Monitoring Component of Hadoop Platform
verification interface, etc.[8]. It requires these interactive interfaces to follow the rules of big data collection and
+retention, data access in multiple formats (structured, semi- structured, unstructured), and support for common data
+collected tools.
+D. Usability
+Usability mainly includes learnability, user error protection and so on. The measurement of learnability includes consideration of whether the software presentation documents or the software system helping documents are easy to operate, comfortable and effective. And according to the file, whether the big data system can be easily deployed, or a graphical interface system of the configured tool is provided. User error protection considers whether the system prompts the delete operation when the product software performs the delete operation.
+Throughput: Platform IO processing capability is suitable for HDFS, Hbase and other technologies. The involved tools of performance analysis include the TestDFSIO tool that comes with Hadoop and the performance testing tool Yahoo! Cloud Serving Benchmark (YCSB), etc.; the database IO processing capabilities, such as MPP database, can include sequential table scan single node performance, single node data import and export, and accurate query of tens of billions of tables.
+Data processing: including the speed of executing queries or MapReduce jobs, as well as the computing power of the platform. For example: the Spark computing power mainly uses aggregate query and Terasort algorithm as performance evaluated standards. Aggregate query is the task of submitting aggregate query in Spark cluster, and you can view the amount of data processed by each Executor and the processing time by visiting the Spark's Web UI interface; Terasort algorithm evaluation is also in the Spark cluster. By running the TeraSort tool, the generated random data is sorted, and the amount of
+E. Reliability
+Reliability mainly includes availability, fault tolerance, easy recovery and so on. For the big data system, under the above sub-features, the main measured elements are system redundancy and data backup strategy.
+System redundancy: Check whether the number of sub-nodes of HDFS, HBase, and MPP components of the big data system is redundant.
+Data backup strategy: Check the number of copies of HDFS data‘s settings, HBase, MPP databases’ data backup strategy.
+F. Security
+The sub-characteristics of information security mainly include confidentiality, non-repudiation, authenticity, data security etc.
+80
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+ Confidentiality:User access rights of the big data
+system includes the configuration of roles and users in the unit of system components, according to the granularity of data table level and data column level to assign permissions to users;
+ Non-repudiation: the operation log of the big data
+system cannot be modified or deleted;
+ Authenticity : identity authentication mechanism,
+check the identity authentication method, password complexity requirements and login of users by the big data system.
+ Data Security:check whether the system provides
+data storage encrypted and decrypted functions; sensitive data is encrypted transported.
+G. Maintainability
+Maintainability mainly includes analyzability and modifiability. The analyzability’s elements are to confirm the installation and deployment of the big data cluster nodes and the data nodes, and to view the version information of the system. Modifiability is mainly to check the system's online upgrade function and data update mode.
+H. Portability
+The sub-characteristics of portability includes adaptability and installability. The adaptability’s metric is to confirm the operating system, database, browser that the big data system is adapted to. Installability is mainly check whether the managing node and data node of the big data cluster can be installed.
+suitable for big data system , compared with the general software quality model for analysis. It is hoped to provide reference for the big data platform test and improve the quality of the big data software.
+REFERENCES
+[1] ISO/IEC 25010:2011 “System and software engineering—Systems and software quality requirements and evaluation(SQuaRE) Part 10: System and software quality models”;
+[2] ISO/IEC 25023:2016 “Systems and software engineering—Systems and software Quality Requirements and Evaluation (SQuaRE)—Measurement of system and software product quality”;
+[3] ISO/IEC 25051:2014 “System and software engineering——Systems and software quality requirements and evaluation(SQuaRE) Part 51:Requirements for quality of ready to use software product (RUSP) and instructions for testing”;
+[4] Yuyu Yuan. Practical quality model for evaluating software products. Computer Engineering, 29(5):32-34, 2003;
+[5] GB/T 38673—2020 “Information technology——Big data——Basic requirements for big data systems (Chinese)”;
+[6] ISO/IEC 25024:2015 “Systems and software engineering — Systems and software Quality Requirements and Evaluation (SQuaRE) — Measurement of data quality”;
+[7] ISO/IEC 25012:2008 “ Software engineering — Software product Quality Requirements and Evaluation (SQuaRE) — Data quality model” ;
+[8] GB/T 38672—2020“Information technology ——Big data——Interface basic requirements(Chinese)”.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+IV. CONCLUSION
+By analyzing the characteristics of big data software, this paper has formed a set of software quality requirements system
+81
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt b/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt
new file mode 100644
index 0000000..b2ba614
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt
@@ -0,0 +1,196 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Hindawi
+Mobile Information Systems
+Volume 2022, Article ID 4339456, 12 pages https://doi.org/10.1155/2022/4339456
+Research Article
+Regulatory Mechanism of Financial Market Resource Management Driven by Big Data
+Wangsong Xie 1 and Jianjun Cao2
+1Business School, Wuxi Taihu University, Wuxi 214064, Jiangsu, China
+2Human Resources Department, Wuxi Taihu University, Wuxi 214064, Jiangsu, China
+Correspondence should be addressed to Wangsong Xie; xiewangsong@126.com
+Received 15 April 2022; Revised 31 May 2022; Accepted 23 June 2022; Published 30 July 2022 Academic Editor: YangGao
+Copyright © 2022 Wangsong Xie and Jianjun Cao. This is an open access article distributed under the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.
+In order to further understand the current situation of the financialmarket and better supervise the resource management of the financialmarket, combined with big data and cloud computing technology, through the construction of big data cloud platform resource management system and the integration of various technical computing frameworks, we can realize the effective supervision of big data resources in the financial market. Using J2EE technology, this paper analyzes, designs, implements, and tests the investment data management system, analyzes the content of the software engineering subject, and obtains the demand function description of the business. According to the software development process and the actual situation of enterprise investment, this paper expounds the basic requirements of the investment data management business, system architecture requirements, user use case status, and the operation and configurationenvironment of the investment data management system.
+ ispaperanalyzesthetechnicalcharacteristicsandoperationindicatorsofthesoftware,andestablishesthedataflowforthedata related to investment data management, such as information statistics, data query, information classification and so on. Finally, thesystem isverified,operatedand tested,and thebusiness usecases andparameters ofthesystem aretestedaccordingtothetwo indicators of software testing. e basic functions of the investment data management realized by the system are correct, the design is reasonable, the operation is stable, the operation response time is short, the operation accuracy is high, and the data access efficiency is good.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+1. Introduction
+Today, with the advent of the information network era, data and information are becoming more and more important, especially for all areas of life. The understanding of big data directly affects the development of an enterprise or industry. With the advancement of communication and dataization, the integration of finance and big data industries in the new economic era is crucial. The emergence and continuous improvement of big data can increase the transparency of financial markets. With the help of new technologies such as big data and cloud computing, financial services can discover more important and useable data from big data and enhance this data to promote the health of the financial system. At the same time, big data can support research on Internet business management and financial markets, help
+financial markets achieve greater influence, better avoid business risks, and improve the performance of financial service businesses [1]. However, with the continuous increase of financial market resources, especially the fact that more and more idle funds of the public are handed over to financial institutions for asset management, the supervision of financial institutions is becoming more and more important. Under the dual influence of internal and external regulatory policies and regulators, the financial market urgently needs to strengthen the construction of resource management and supervision mechanism, as shown in Figure 1. Based on this, the article combines big data and cloud computing technology to achieve better management of big data in the finance industry and maintain multi-inclusive management and integration by creating a big data cloud platform experience. At present, the research and discussion
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+2
+A collection of Portfolio
+investment Fund manager investment
+Investors a Securities a Investors b Fund Securities b
+Investor’s c Fund trustee Securities c Figure 1: Financial market resource management.
+mainly focus on restricting the investment of asset management business in nonstandard business. The system recently introduced at the regulatory level also reflects the opinions and clear attitude of standardizing nonstandard asset investment [2]. At present, the development trend of the financial industry is mixed operation and financial innovation. Nonstandard assets have played an important role in activating the financial market, enriching financial instruments and serving the investment and financing of the real economy. The return to simplicity can only be relative, and the return to simplicity of financial derivatives is completely inconsistent with the reality of development.
+2. Literature Review
+Huanget al.[3] studied theinvestment system of enterprises and made some achievements in the research process [3]. Sultanaw et al. [4] put forward the theory of “reference design model” for the investment management system in South Korea. e theory adopts a strategic way to sort and manage the investment information, and handles the in- formation security problems in the task of the management system through effectivemeans. It forms a unique theory for the actual investment management system [4]; Phi- boonbanakit and Horanont [5] solved the demand analysis of investment management system, improved the quality of system analysis report from the aspect of reliability, com- bined analysts and business personnel, and eliminated some obstacles between them [5]. Qu [6] believed that the essence of the model is based on the “cooperation mechanism.” Process capital analysis can solve existing problems and solve problems in investment management level assessment from the perspective of cooperation and collaboration [6]. Yan et al. [7] said thatthe investment management system is carried out around services, through high-quality services, shaping and strengthening a good public image of invest- ment, creating a favorable public opinion environment, striving for favorable investment policies, and finally real- izing the long-term development of investment manage- ment [7]. Watson et al. [8] believed that the investment management platform, as an important part of digital
+Mobile Information Systems
+investment, is a scientific management guarantee for real- izing investment, involving all links and multi-level com- prehensive application of investment management. e investment management system with scientificmanagement asthecore,effectivelysupportstheimplementationofdigital enterprises, improves the management efficiency of enter- prise parks, and becomes an irreplaceable platform for in- vestment management of enterprises [8]. Hyers [9] said that for capitalist countries, the main goal of market supervision is simple and clear, that is, to maintain market order by relying on mandatory laws, systems and norms, and its market supervision behavior is controlled by the nature of capitalism. erefore, with the development of capitalist market and the change of government functions, there are various studies on market supervision [9]. For example, Connolly Barker et al. [10] believed that market regulation is the comprehensive control of various factors in the market by the government in order to ensure social stability and sustainable economic development, to standardize market behavior, and to ensure orderly operation of the market and maintain stable economic development [10]. Keane et al. [11] said that market regulation is a passive government behavior. Since the market cannot spontaneously maintain good order, the government needs to participate in regu- lation. erefore, market regulation must have mandatory elements. With the continuous development of the market, the market supervision implemented by the government must achieve dynamic follow-up, that is, the government supervision can meet the needs of market development [11]. Guan et al. 
[12] believed that if the market supervision implemented by the government cannot meet the needs of the current market, it will lead to the lack of supervision in some supervision and many problems; although the gov- ernment’s market supervision comprehensively includes market factors, if the supervision is too frequent, or even the supervision strength exceeds the market bearing capacity, it will restrict the benign self-development of the market to a certain extent [12]. Maddumala et al. [13] said that the characteristic of market supervision is that functional de- partments not only supervise in accordance with relevant lawsandregulations,butalsomanageallaspectsandlinksin the market. Due to the characteristics of socialist economy, the government also supervises its own market behavior to comprehensivelyensurethestabilityandorderofthemarket [13].
+Based on this research, this paper proposes a regulatory mechanism based on big-data-driven financial market re- sourcemanagement.Inthispaper,usingtheJ2EEtechnique, analyzed, designed, implemented, and tested the investment data management system, to analyze the content of the software engineering project, get the business requirements function description, based on the software development process, according to the actual situation of enterprise in- vestment, the basic requirements of the investment data management business, the system architecture require- ments, the status of the user use case are expounded. For the operation and configurationenvironment of the investment data management system, the technical characteristics and operation indexes of the software are analyzed, and the data
+Mobile Information Systems
+related to investment data management, established the data process, such as information statistics, data query, infor- mation classification, and other contents, at last, verify the running and tested the system, according to the two aspects of the software testing indicators, service case and param- eters of the test system. e basic functions of the system are correct, with reasonable design, stable operation, short operation response time, high operation accuracy, and good data access efficiency. e test results show that the in- vestment data management system of the investment en- terprise operates normally, and the various operating parameters of the software meet the design requirements and software engineering standards.
+3. Design of Supervision Platform for Financial Market Resource Management
+3.1. System Functional Requirements.According to the construction objectives, the basic functions of the invest- ment data management platform are shown in Figure 2 below.
+(1) Design the enterprise basic information manage- ment module, the main functions are: manage the basic situation of the enterprise, list statistics of subordinate enterprises, and manage the basic business of the enterprise;
+(2) Management and investment project information module: manage high-risk financial investment projects, foreign investment projects, and fixedasset investment projects;
+(3) e investment summary and analysis module in- cludes enterprise basic information summary, for- eign investment project summary, and fixed asset investment project summary;
+(4) Management of investment implementation: quar- terly progress of major projects, annual imple- mentation of projects, annual implementation of fixed asset investment projects, foreign investment projects, and high-risk financial investment;
+(5) Statistical risk data, investment risk management module shows the risk of investment projects;
+(6) e system login module provides user login. At the same time, only the system administrator can add, modify, and delete business operators. e system administrator can only add from the database [14].
+3.2.SystemUseCaseStatus.Use case diagram is a key factor in the software development engineering. It reflects the relationship between all users and system business functions in a system. e drawing of use case diagram will clearly reflecttheoperationpermissionsofdifferentusers,asshown in Figure 3.
+ e administrator of the investment data management
+system can handle the following businesses in the system: managing investment risk, managing investment project information, managing enterprise information, managing
+3
+system data, managing investment execution, user login, investmentsummary,and analysis,etc., einvestmentuser of the investment data management system can handle the following businesses in the system: management of invest- ment risk, management of investment project information, management of enterprise information, management of investment execution, user login, investment summary analysis, and other permissions [15].
+3.3. System Data Flow Requirements
+3.3.1. Top Level Data Flow.As shown in Figure 4, the top- level data flow is designed to display the data interaction process and reflecttheinvestmentdata managementsystem. e main business data processed are: investment execution data, project risk basic data, enterprise basic data, invest- ment project data, and user basic data. e data flow fully shows the flow direction of system design.
+3.3.2. Query Data Flow. As shown in Figure 5, the data information of the investment data management system for investment enterprises mainly deals with the query data, including project risk data, investment department data, system user data, and investment execution data. rough the query flow chart, the final query flow direction of the investment data is the storage table of the database, which is themainfeatureofaninformationmanagementsystem[16].
+3.3.3. System Login Data Flow.AsshowninFigure6,theuser login process of the investment data management system is established, and the window provided for user login is dis- played on the operation interface. In the test process, input their own login information first. After confirming that the information is input correctly, operate the “login” button below. e interface program will analyze whether the user informationexistsandverifytheiruseridentity. etestshows that if the login information is operated correctly, the main interface of the investment data management system will be opened,otherwise,theinterfacewitherrormessagewillappear.
+3.4. Overall System Design
+3.4.1. Network Structure Design.Since the design should meet the actual needs, the solution of the investment data management system of the investment enterprise should realize the management and analysis of the investment data management information when designing the investment data management system, and the selected network equipment should meet the requirements. This is a relatively advanced model in the industry and is composed of the data network system [17]. The manager manages the data in the database. For the network products widely used in the world, when selecting the products of internationally well-known manufacturers and designing the network equipment of the investment data management system, the principle of safety, stability, and reliability shall be followed to ensure the smooth implementation of investment data management.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+4 Mobile Information Systems
+Functional structure of financial investment data management system
+Manage
+Enterprise basic Investment Investment Investment
+Investment System information summary Execution Risk
+Project login management analysis Management Management
+Information module module module Module Module
+Module
+Figure 2: Functional structure of financial investment data management system.
+data management system User login
+System data management
+Enterprise Information Management
+investment project management
+Enterprise administrator investment user
+Investment summary analysis
+Investment execution
+Investment Risk Management
+Figure 3: Use case diagram of financial investment data management system.
+Investment
+Investment Execution Investment
+Corporate project Information Risk
+Information information Information User Info
+data exchange
+Figure 4: Top level data flow diagram of financial investment data management system.
+Mobile Information Systems 5
+Teaching information
+Laboratory Information
+query
+Data query data
+processing entry
+Personnel information
+Instrument and equipment information
+Figure 5: Data flow diagram of data information query.
+physical enter the input Check Compare perform Complete Enter the system
+login system main
+Certification databases login
+verification page
+Figure 6: System security access data flow diagram.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+ e investment data management business data takes the front-endswitchasthebufferlibrary,integratesthedatainto the central database through the data exchange platform, accesses all hosts to the server in the internal LAN, and accesses the system with the external Internet. VPN tech- nology can be used on the Internet. For users without an external network, the data center is deployed on the external network of the enterprise. e resources of the investment data management data center can be accessed safely through theInternetnetwork,andtheusersofthenetworkcanaccess in the same network [18]. e remote control of the client can be realized through the network data exchange. e investment data management system of the investment enterprisecanactivelyinitiatetheconnectiontothenetwork and has the wired communication function between the server and the client. It can obtain the current system status oftheclientandthedataoftheinvestmentdatamanagement businessinrealtime,soastorealize thecontrollabilityofthe whole investment data management information trans- mission process.
+3.4.2. System Function Structure Design
+(1) First, Software Data Layer. Data layer maintenance is the application-oriented data existing in the system. Through the storage medium, the system-related information is stored in a certain medium and saved in a regular way. The
+upper end of the system can carry out various effective operations on the information in the database through the program software, so as to achieve the business function, data storage, and data access of the client of the investment management system. Its main core operation is the input and output of data. If these two points are handled well, the business function of a management system can be handled accurately [19]. In the investment data management system studied in this paper, various tables of relevant data are stored in the database environment. The client can call and access the information of enrollment management, plan management, personnel management, and so on.
+(2) Second, Software Middle Layer. In the investment data management system of investment enterprises, in addition to the traditional data storage mode, the database access middleware technology is also designed and used. A layer of middleware system is designed between the database and the logic layer. Its main function is to quickly connect the business layer and the database. Through the connection of this interface, the encapsulated function events will be called when the data is input and output, which reduces the programming of the program end. It also improves the data transmission efficiency and realizes stable high-level applications in the process of communication interaction. It is of great value for maintaining, transplanting, and upgrading the management system in the future expansion [20].
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Mobile Information Systems
+9
+3.4.3. Software Presentation Layer. In the business layer, the interface of software client is designed and developed through J2EE technology, and the operation code is programmed. According to the design of investment data management module, the management function is designed in detail. According to the business needs, the enterprise network is established: investment summary and analysis module, investment project management module, investment risk management module, investment execution module, data management module, user login module, enterprise information management module, etc., as shown in Figure 7.
+3.4.4. Risk Management Module. The design of investment project risk management function is shown in Figure 8. By analyzing the risk data existing in the implementation of the investment project, the risk problems that have been handled can be updated and deleted. The system user can add, view, analyze, and process the risk data of the investment project. The function of data binding, display, management, and maintenance of investment project risk realizes the maintenance of the investment risk data. Realize the data update, as shown in Figure 9. Execute the update operation, enter new data in it, and update the data through the inputable dialog box after completing the input. According to the security strategy of hierarchical protection and combined with the characteristics of management business, the community management system should be divided according to the construction of security protection system of each security domain, external network platform domain, and internal network platform domain
+[21]. The terminal machine room shall ensure safety and security: fire prevention, anti-theft, dust prevention, waterproof, anti-static, and anti-power failure. The security system design of the investment data management system follows the security system model. Under the guidance of the unified hierarchical protection security strategy, the security system design of the whole online management platform is divided into several important contents, such as the construction of security technology security system, emergency response system, and security management security system. The construction of security technology guarantee system includes security infrastructure (including unified authentication, password service system, trusted timestamp service system, etc.), and security service system (monitoring and detection system, etc.). The construction of emergency response system includes emergency response objects, processes, institutions, and other aspects. The construction of safety management guarantee system includes organization, system, management means, safety audit, and so on.
+4. Key Technologies of Resource Management for Big Data Drive
+4.1. Big Data Platform Computing Framework. There are many computing frameworks for different scenarios of big data processing, including MapReduce parallel computing model, spark memory computing framework, and some
+streaming computing frameworks. MapReduce parallel computing model is mainly used in large-scale batch computing scenarios. Due to its poor performance in iterative algorithms, spark memory computing framework appears. Spark memory computing framework greatly improves the performance of data mining and machine learning algorithms [22]. The streaming computing framework mainly deals with the application scenarios with strong real-time and interactive requirements. Different computing frameworks have their own advantages. A large-scale system often faces a variety of application scenarios, and a variety of computing frameworks can play their respective roles. This paper mainly uses MapReduce parallel computing model. Traditional parallel computing models include data parallel model and message parallel model, data parallel models such as HPF and message passing models such as MPI and PVM. When using the traditional parallel computing model to write programs, users need to intervene in the division of data and the synchronization of tasks and the burden of programmers is heavy. In order to reduce the programming difficulty of parallel processing massive data, MapReduce program can run on a cluster composed of cheap commercial machines because it does not care about the performance of a single node and has high fault tolerance [23]. MapReduce parallel computing model shields the detailed implementation of the underlying parallel program. Users only need to use map function and reduce function to define their own business processing logic, which is simple and easy to learn, freeing programmers from the heavy burden of traditional parallel programming model, and greatly promoting the development of massive data processing and analysis ability.
+4.2. Joint Optimization of System Resources
+4.2.1. Virtual Machine and Physical Server Model. This paper assumes that CP provides a total of K different types of VMs, where $k \in \mathcal{K} := \{1, 2, \ldots, K\}$ represents the k-th type of VM. Each type of VM is preset with different types and quantities
+of resource requirements, such as CPU, memory, and hard disk, and $g(k)$ is used to represent the demand for VM
+resources of type k. In addition, this chapter assumes that there are M physical servers in the DC, and the resource capacity of each physical server $m \in \mathcal{M} := \{1, 2, \ldots, M\}$ is
+denoted by $c(m)$.
+4.2.2. Virtual Machine Request Model. It is assumed that there are a total of H different types of VM requests arriving, and each request type $h \in \mathcal{H}$ corresponds to different types and quantities of VMs. At the same time, this chapter assumes that the number of different types of VMs required by each VM request is randomly distributed and independent of each other, and uses $r(l, k)$ to represent the number of VMs of type k required by VM request l. Therefore, the total resource requirement of VM request l can be expressed by formula (1):
+$$r_l = \sum_{k} r(l,k)\, g(k). \qquad (1)$$
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Mobile Information Systems
+
+System front desk
+middle layer
+System background
+Network Public Opinion Database
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Mobile Information Systems
+
+Figure 7: Overall functional architecture of the system.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Mobile Information Systems
+11
+Start
+Investment execution
+no
+Is there a risk
+yes
+Display risk data
+end
+Figure 8: Risk management operation process.
+4.2.3. Income Model. Usually, the CP will bring certain benefits for each VM request it receives. This chapter assumes that instantiating a VM of type k can bring $p(k)$ to CP per unit time. Although the CP can actively reject some VM requests so that there are enough remaining resources to accommodate subsequent VM requests with higher revenue value, rejecting VM requests will still bring certain negative impacts to it, such as affecting its reputation, etc. [24] Therefore, this paper introduces a "penalty" mechanism to characterize the indirect loss caused when the CP rejects a VM request, and uses $\varphi(k)$ to represent the unit time loss caused by the CP rejecting a VM of type k. Thus, the actual benefit that CP obtains from VM request l can be expressed by (2) and (3):
+$$R(l) = \sum_{k} \rho(k)\, r(l,k)\, \tau(l), \qquad (2)$$
+which means l is accepted;
+Start
+Enter new information
+no Is the input data
+canonical?
+yes
+Execute update function
+Data Update
+end
+Figure 9: Risk data update operation flow chart.
+$$R(l) = -\sum_{k} \rho(k)\, r(l,k)\, \tau(l), \qquad (3)$$
+which means l is rejected.
+4.2.4. Virtual Machine Request Joint Optimization Decision Making Problem. The core problem of the joint decision optimization of VM access control and resource allocation is to design a strategy that can evaluate the impact of the current resource allocation decision on the future resource
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Mobile Information Systems
+21
+$$\pi^*(s_l) = \arg\max_{\pi(s_l) \in A(s_l)} \left\{ R(s_l, \pi(s_l)) + \gamma \sum_{s_{l+1} \in S} P(s_{l+1} \mid s_l, \pi(s_l))\, V^{\pi}(s_{l+1}) \right\}. \qquad (9)$$
+The strategy obtained by the above formula is the op-
+considered, when any VM request l reaches the DC and the CP adopts the decision, the conditional state transition probability of the system in the case of the next random event can be expressed as three cases by the following formula, as shown in formulas (10)–(12):
+timal decision $\pi^*(s_l)$ corresponding to each state.
+Any VM request can arrive and any VM request can leave. Since this paper assumes that the decision of any VM request is determined when it arrives, the state of the system will not change at the middle time of two adjacent random
+utilization and the potential benefits of CP, so that the comprehensive optimization decision that is the most conducive to improve the long-term benefits of CP can be selected for the currently arrived VM requests. Therefore, under the joint optimization strategy, for any VM request that arrives, CP needs to consider whether it needs to be accepted and how to allocate resources to it after acceptance, and judge the probability of resource blocking or resource waste by quantitatively evaluating the impact of this decision on subsequent decision-making. Maximize the benefits of the final decision [25].
+The goal of the VP problem is to design an optimal decision function $\pi^*$, so as to maximize the expected discounted
+revenue (EDR) of CP in a long time, as shown in (6):
+maxRπs0 � Eπs0⎧⎨⎩∞ Rl sl,π slctP s + 1|s ,a � λh + 1 � h,s + 1 � s + a (10)
+events. Therefore, CP only needs to make corresponding decisions on the VM request when it arrives. Thus, the state transition probability of the system can be defined as the probability that the next random event is the arrival of a VM request or the departure of any deployed VM request under a given system state and its corresponding decision. Since the resource reallocation of deployed VM requests is not
+l l l λ sl,al,pl l l l,
+P s + 1|s ,a � nh′μh′ + 1 � 0,s + 1 � s + a −ah′
+l l l λ sl,all l l l l′ ,
+,p
+(11)
+⎭. (6)
+⎫⎬
+l�1
+ e joint optimal strategy of virtual machine access control and placement can be expressed as (7):
+$$\pi^* = \arg\max_{\pi \in \Pi} R^{\pi}(s_0). \qquad (7)$$
+
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt b/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt
new file mode 100644
index 0000000..e8b1bc5
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt
@@ -0,0 +1,188 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Physica Medica 69 (2020) 28–35
+Contents lists available at ScienceDirect
+Physica Medica
+journal homepage: www.elsevier.com/locate/ejmp
+Original paper
+A systematic quality assurance framework for the upgrade of radiation
+T oncology information systems
+Baoshe Zhang ⁎, Shifeng Chen, Warren D. D’Souza, ByongYong Yi
+Department of Radiation Oncology, University of Maryland School of Medicine, Baltimore, MD 21201, USA A R T I C L E I N F O A B S T R A C T
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Keywords:
+Quality assurance
+Radiation oncology information system Clinical data integrity and safety Radiation oncology data management Integrated oncology system
+In spite of its importance, no systematic and comprehensive quality assurance (QA) program for radiation oncology information systems (ROIS) to verify clinical and treatment data integrity and mitigate against data errors/corruption and/or data loss risks is available. Based on data organization, format and purpose, data in ROISs falls into five different categories: (1) the ROIS relational database and associated files; (2) the ROIS DICOM data stream; (3) treatment machine beam data and machine configuration data; (4) electronic medical record (EMR) documents; and (5) user-generated clinical and treatment reports from the ROIS. For each data category, this framework proposes a corresponding data QA strategy to verify data integrity. This approach verified every bit of data in the ROIS, including billions of data records in the ROIS SQL database, tens of millions of ROIS database-associated files, tens of thousands of DICOM data files for a group of selected patients, almost half a million EMR documents, and tens of thousands of machine configuration files and beam data files. The framework has been validated through intentional modifications with test patient data. Despite the big data nature of ROIS, the multiprocess and multithread nature of our QA tools enabled the whole ROIS data QA process to be completed within hours without clinical interruptions. The QA framework suggested in this study proved to be robust, efficient and comprehensive without labor-intensive manual checks and has been implemented for our routine ROIS QA and ROIS upgrades.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+1. Introduction
+With the advancement of computer technology and the transition
+from paper-based medical records to electronic medical records (EMRs)
+[1–3], radiation oncology information systems (ROISs) [4] have become increasingly complex and data-intensive. Their functionalities
+have been extended from a simple record-and-verify system [5] to a comprehensive radiation oncology patient care system with numerous subsystems, such as patient image storage, patient demographics, treatment scheduling, treatment delivery and records, follow-up visits, and even treatment planning. ROISs are playing a pivotal role in improving patient care regarding efficiency and safety [4], as well as reducing the error rate in the clinic [2,6,7]. However, a ROIS, as an emerging complex technology, may face new challenges and introduce
+a new venue for errors [6,8]. Therefore, quality assurance (QA) issues for ROISs have been raised in the radiation oncology community [7,9].
+There are occasions that can put ROISs at high risks, such as, a software upgrade or hardware change [10], which might be in company with database migration. Because of the complexity of patient data and
+hybrid database storage architecture, database migration is becoming
+much more complex and risky. A clinical ROIS provides treatment parameters (such as gantry angle, collimator angle, couch angle, jaw position, multileaf collimator position, monitor units, etc.) to a treatment delivery system (such as linear accelerators) and then records all treatment histories and activities. If any of the treatment parameters is accidentally modified in the database during the ROIS upgrade, treatment will deviate from the intended plan, with consequences that could harm patients and/or lessen treatment effectiveness. An intensity-modulated radiation treatment/volumetric-modulated arc therapy plan might include thousands of treatment parameters, so that it is almost impossible to check these manually as was done in the past. Despite vigorous software QA by the vendors of ROISs before the release of a new version, it is still the responsibility of clinical physicists and IT group members to check and confirm their own data integrity. As a type of medical device, ROISs deserve a comprehensive QA method like any other equipment in radiation oncology. However, few how-to instructions or recommendations for ROIS QA methods have been published [13]. Therefore, it is crucial to perform a series of QA for checking consistency during a ROIS upgrade and the QA procedure should be automatic for a practical reason.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+⁎ Corresponding author.
+E-mail address: bzhang4@umm.edu (B. Zhang).
+https://doi.org/10.1016/j.ejmp.2019.11.024
+Received 17 March 2019; Received in revised form 8 November 2019; Accepted 26 November 2019 1120-1797/ © 2019 Associazione Italiana di Fisica Medica. Published by Elsevier Ltd. All rights reserved.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+This article presents a systematic QA framework for verification of ROIS information integrity after a significant change happened to ROIS, such as ROIS software or hardware upgrades or data migrations.
+2. Methods and materials
+This framework mainly focuses on clinical data sources and struc-
+tures in ROIS. All data are categorized into five kinds: the ROIS SQL [11] database and its associated files, ROIS DICOM [12] data streams, ROIS machine data files and configurations, EMR documents, and clinical reports generated from the ROIS. The principle of the QA framework compares these five data sources and data structures between ROIS states. Once data integrity is verified, an end-to-end test is performed to further check connections and interfaces between the ROIS system and other clinical systems (such as treatment planning systems, treatment control consoles, and hospital information systems).
+2.1. ROIS relational database
+From time to time, due to performance improvements, security concerns, or bug fixes, a ROIS relational database (see Appendix I for details) system would be upgraded. Sometimes, it involves data mi- gration. Usually, data migration occurs in the following situations but
+not limited to: (1) the vendor strategically changes partnership with commercial database software companies or simply adopts a new database server architecture based on performance and features; (2) the vendor simply adopts a new hardware and relocates data from a legacy storage to a new data storage, or from a server to another; (3) the vendor redesigns their database schema and architecture and needs to move data from the legacy databases to the new databases. During ROIS upgrades, possible data risks include implicit data loss and explicit data
+loss, data corruption, and corrupted data relationships.
+In order to verify migrated data in databases, the first step is to compare database schema to figure out how data have been re- structured and migrated from the legacy database to the new database
+and how data relationships have changed — for example, to identify any added or deleted data columns or tables or any data type change for a
+data column. An existing data column may move to a different data table, or a data table or column may be renamed. Moreover, data aggregations or data splits may have occurred. Such a database schema change is illustrated in Fig. 1. Here, a new data table C in the new database contains data from tables A and B in the legacy database. This diagram also shows that a data column being moved from the legacy database might end up with a different data column name in the new database.
+
+Fig. 1. Diagram for database schema change. Data table C is in the new data- base, and data tables A and B are in the legacy database. Data column c1 in data table C contains the same data from data column a1 of data table A, and so on for data columns c2, c3, and c4.
+
+Fig. 2. Database schema comparison. Here A represents the legacy database, and B represents the new databases. Region (c) represents common data ex- isting in both databases, region (a) represents data removed from B, and region (b) represents new data in B.
+According to database schema changes, data comparison between
+two states of databases can be implemented by either creating data views or designing complex data comparison statements. In our implementation, we used A-B and B-A (A and B are datasets from an SQL query statement for legacy databases and for new databases, respectively) to identify differences between A and B. In Fig. 2, region (a) represents the data that exist in the legacy database but not in the new database (A-B); region (b) represents newly created data that never existed in the legacy database (B-A) and region (c) represents data that exist in both the legacy database and the new database (A ∩ B).
+It is time-consuming and technically challenging to compare big and complex databases. In order to speed up data comparison, concurrent multi-process or multi-thread techniques should be used to process sectional database. A ROIS system might be composed of several da- tabases. Each database might have hundreds or thousands of data ta- bles. Since database servers support parallel data access, each con- current process or thread can handle a portion of a database. For a big data table, its data comparison can be distributed among multiple processes or threads by carefully splitting the data table into multiple sections.
+2.2. ROIS DICOM interface
+DICOM is a de facto standard in medical fields, including radiation oncology, for patient data exchange and storage, such as exporting radiation therapy (RT) information (e.g., contours, treatment plans, dose distributions of treatment plans, treatment records and radiation therapy images) to a clinic linear accelerator. A ROIS exchanges patient demographic information and radiation treatment information with other radiation oncology systems through DICOM data streams. Although relational databases are the ultimate patient data storage, the information in these databases must be converted into a DICOM data stream before being sent to other systems, such as sending treatment plans to a treatment delivery system. In addition, the ROIS receives information from other systems through its DICOM interface, then converts and stores the information in its relational databases.
+DICOM data streams group information into data sets and use three
+different element encoding schemes. It has a 2-byte field for informa- tion group specifying information class (such as patient information), a
+2-byte field for information element specifying a particular data (such
+as patient name), a 2-byte field for data type (such as, ST indicates that the data type is short text.). Further, DICOM uses sequences to create nested data structures to store complex attributes. DICOM stream has some time stamps, such as DICOM object creation time. Therefore, even
+for the same DICOM object, two DICOM exports will produce two dif- ferent DICOM data streams. In DICOM data comparison, we only compare essential information instead of comparing every bit contained in DICOM data stream. For example, when two DICOM RT-plan data streams are compared, DICOM object instance creation time will be
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+
+Fig. 3. DICOM interface of ARIA ROIS.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+ignored but other information (such as plan parameters and referenced structure and referenced patient information and various DICOM unique identifiers) will be compared.
+DICOM objects (such as RT-Plan) for a group of selected patients are
+automatically exported from the relational databases through the ROIS
+DICOM interface and stored in the file system by a DICOM storage server (Fig. 3) for two ROIS states, such as pre- versus post-upgrade.
+Then the unique identifications (UIDs) of DICOM service-object pair (SOP) instances are used to pair DICOM files between ROIS states. A DICOM comparison tool will read each data element from a pair of
+DICOM files for comparison, and then generate a comparison summary
+report (Fig. 4a and Fig. 4b and Fig. 4c). The procedure not only checks
+to determine whether the ROIS DICOM interface is working properly but also implicitly verifies data in the ROIS databases.
+2.3. Beam data and machine con figurations
+When treatment machines, such as clinic linear accelerators, are commissioned, a set of machine model parameters are generated based on clinical measurements. These parameters are used for beam modeling, dose calculation, treatment plan validation, etc. Individual sites might have different preferences in machine settings and configurations. To verify machine data and configurations, our approach is to generate an MD5 hash string for each data file between ROIS states.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+
+Fig. 4a. Snapshot of a DICOM comparison report. In this instance, all plan parameters and treatment records are identical.
+
+Fig. 4b. Sample report of DICOM RT-Treatment Record changes. In this instance, treatment records have been changed but the plan parameters are identical.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+31
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+Then these MD5 hash codes are compared to determine if the machine data files are intact. If machine data changes occur, our approach is to obtain the file format information from the manufacturer to compare data and determine what kinds of changes were made. For example, if machine data are saved in XML, an XML file parser is used to compare changes of critical information.
+2.4. ROIS static files and EMR documents
+Relational databases usually store big trunks of binary data (such as
+images, doses, contours, etc.) as disk files in patient folders. The contents of these files are not modified frequently during routine practice and are kept intact, as are the contents of EMR documents. Because of
+the very large numbers of these files with terabytes of disk storage, it is not practical to generate a separate copy of all these files for each ROIS state. Our strategy is to generate an MD5 hash string for each such file between ROIS states and then compare paired MD5 hash strings to determine whether any such file has been corrupted or altered.
+2.5. User-generated documents in ROIS
+User-generated documents are usually template-based and can be
+generated from information in the ROIS relational databases, such as
+patient appointments during a period of time, radiation treatment his-
+tory, a list of patients under a specific treatment protocol, etc. These reports use common file formats, such as Microsoft Excel, Word, or PDF, so that they can be viewed by third-party software. Our approach uses
+file parsers to retrieve information from these reports and compare
+them between ROIS states to make sure that information in these re-
+ports is identical and accurate. In our clinic, comparison of these reports is automatically performed by in-house built Excel, Word, or PDF file parsers.
+2.6. Mode-up test and end-to-end test
+After data integrity testing, a mode-up test and an end-to-end test
+are performed following clinical workflow (Fig. 5). Therapists loaded each treatment beam of the plans for under-treatment patients into the treatment machines to confirm whether the plans are deliverable. The end-to-end test uses a phantom patient and follows the treatment procedures from CT simulation scan to treatment delivery. All treatment records, including captured images and treatment history, are checked. During this entire end-to-end test process, data in each step are
+carefully verified. The end-to-end test will not only check the essential ROIS software functionalities but also help to confirm the connectivity between ROIS and other clinical systems.
+3. Results
+The radiation oncology practice at the University of Maryland Medical System includes five photon sites (a main campus and four community practices) and a proton site; and all sites share a single ARIA (Varian, Palo Alto, California, USA) ROIS. Both of the QAs with our novel method following upgrades from version 11.2 to 11.5 in early
+2014 and from version 11.5 to 13.7 with the proton modality in late 2016 showed that this framework is reliable and effective.
+Both ARIA upgrades and QA were performed over a single weekend. Prior to the upgrades, an XML file describing the SQL database schema changes was generated from both the legacy version and the new ver-
+sion of ARIA. Once the clinics closed on a Friday afternoon, the QA program generated an MD5 hash string for each database-associated file and each EMR document. Another QA program commanded the ARIA DICOM interface to export treatment plans and treatment records for all under-treatment patients. The pre-upgrade SQL databases of the ARIA ROIS were kept for comparison. Physicists, dosimetrists, and therapists generated clinical reports used for routine practice for later comparison.
+A copy of machine con figuration files and beam data files of each treatment machine was kept for later comparison. Together, all of these
+tasks were completed in 2–3 h. The ARIA ROIS upgrade was then started by the vendor application specialists. After upgrade, the SQL database comparison software started to compare databases table by table and record by record between the pre- and post-upgrade databases guided by the schema change XML file of the database. In parallel, the ARIA DICOM interface was commanded to export treatment plans
+and treatment records for the same patients as those prior to the up- grade. A DICOM comparison program paired DICOM files according to DICOM Instance UIDs and then compared detailed information between paired DICOM files. An MD5 hash string was generated for each data- base-associated file (such as image file, dose file, contour file, etc) and each EMR document, followed by comparison of corresponding pre-/ post-upgrade MD5 hash strings. Another program parsed machine configuration files between pre- and post-upgrades. Clinical and treat- ment reports with the same criteria were exported from ARIA and compared against their pre-upgrade counterparts. All comparison tasks
+were completed on a Saturday. The summary of the comparison results
+was presented to the chief physicist or the upgrade QA team lead for
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+
+Fig. 4c. Sample report of DICOM RT-Plan changes. In this instance, plan parameters have been changed but the treatment records are identical. Here, beam type for all treatment beams was changed from STATIC to DYNAMIC.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+review. When doubts were raised, the vendor's application specialists were contacted for consultation. Should any doubt or suspicion not be resolved satisfactorily, the ARIA ROIS would have been rolled back. Once data QA was performed successfully, the vendor's application specialists came on-site to perform acceptance tests in the presence of local physicists and/or IT personnel. On Sunday, representatives from each functional group, including physicists, dosimetrists, therapists,
+and physicians, performed the mode-up tests and an end-to-end test. Once these tasks had been successfully completed and documented, the
+new ROIS was officially released for clinic use.
+In order not to compromise any clinical patient data, test patients
+are used. All of the modifications have been detected and it was possible to identify the sources of differences using the reports generated from the QA programs. For instance, a series of parameters of a beam from
+a treatment plan has been modified, including monitor unit value, collimator angle, couch angle, jaw field sizes, MLC leaf positions, appointment schedule. These changes will result in exported DICOM RT-
+Plan changes (Fig. 4b and Fig. 4c and Fig. 6) and will also result in database changes (Figs. 7 and 8).
+The system successfully detected true-positive components which have been intentionally added during the upgrade procedure under a test ROIS environment. The error components were a modified delivery plan, an altered treatment history, deletion of an image, addition of an electronic medical record and omission of a patient. During the 2014 upgrade, we verified 1,638 data tables with 2.4 billion data records, 1.86 million ARIA database static files, and 43,153 EMR documents. For 222 patients under treatment, 605 pairs of DICOM RT plans and 13,480 pairs of DICOM treatment records retrieved from the ROIS DICOM interface were compared. 83 new data tables were identified. 74 existing data tables had new data columns added, and 4 data tables from the previous version were removed. Meanwhile, two existing data tables
+were consolidated into a data table. Reports for 5,073 patient encounters over a 2-week period were compared and determined to be identical to those before the upgrade. Contents in 12,237 machine files
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+
+Fig. 5. Clinical workflow for the end-to-end test with a phantom patient.
+
+Fig. 6. Sample report of DICOM RT-Plan parameter changes. In this instance, multiple plan parameters have been altered.
+were compared, and no differences were found between pre- and post-upgrade states. It took about 2 h for pre-upgrade preparation and about 8 h for post-upgrade QA.
+During the 2016 upgrade, we verified 1,891 data tables with 4.4 billion data records, as well as 9.45 million ARIA database static files and 493,034 EMR documents. For 351 under-treatment patients, 1,104 pairs of DICOM RT plans and 22,046 pairs of DICOM treatment records were compared. 165 new data tables and 94 amended or deleted tables were identified. Reports for 8,452 patient encounters over a 2-week period were compared and were identical to those before the upgrade. Contents in 26,165 machine configuration files and beam data files were compared, with no differences identified. It took about 3 h for pre-upgrade preparation and about 8 h for post-upgrade QA.
+4. Discussions
+Data migration errors in radiation oncology have been identified as emerging issues by the World Health Organization [13], and ROIS software upgrades or changes have been identified as imposing high risk [10]. The International Atomic Energy Agency Human Health Report No.7 [14] recommended that quality control be performed after record-and-verify system upgrades. However, the relevant QA tools are far behind emerging technology. Until now, the majority of QA checks in ROISs have been performed via manual checks, such as pre-treatment measurements or spot checks [15]. Because of increasing data quantity and complexity, such manual checks can assess only a tiny fraction of patient data for contemporary ROIS systems with EMR functions. A
+
+Fig. 7. Sample summary report of database changes.
+Fig. 8. Sample report of detailed database table changes. This figure shows two corre- sponding table rows from table
+ dbo.ExternalField between two ROIS states. Here, RadiationSer represents the primary key of table dbo.ExternalField . All other columns (such as, GantryRtn, CollRtn)
+represent attributes of table dbo.ExternalField . Due to space limitations, not all the table columns are listed here.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+34
+B. Zhang, et al. Physica Medica 69 (2020) 28–35
+comprehensive and automated QA tool is imperative for maintaining
+and verifying patient data integrity in the era of big data.
+Clinical implementations of automated QA tools have been reported
+for initial chart checks [16–19]. Hadley et al. [20] used an automated tool for verification of treatment plan parameters after ROIS upgrade and database migration. The transition from conventional manual checks toward automation of patient data QA is challenging. As radiation oncology practices migrate from paper-based medical records to EMRs and the integration of ROIS and hospital information systems advances, information stored in the ROIS has been significantly increased, further complicating information relationships. The ROIS now includes all kinds of patient data and related data, such as patient demographics, clinic appointment schedules, diagnosis codes, treatment
+plan and delivery records, planned and delivered doses, along with clinical notes in the form of text documents. In an integrated oncology environment, none of the information is of less importance than others, and confirmation of integrity is crucial for safe practice.
+Although our automated QA tools check every bit of data, thanks to
+the utilization of multiprocess and multithread techniques, the entire procedure of database integrity QA and other data QAs were able to be completed within hours without clinical practice interruption.
+End-to-end tests following the clinical workflow, from CT simulation to treatment delivery, are helpful for detecting any issue related to ROIS interconnectivity with other clinical systems and to assess major
+components' performances.
+Although we only applied this framework to ARIA upgrades, the
+framework can be seamlessly applied to other ROISs. Also, this
+framework can be trimmed to cater to routine ROIS QA or a different scenario, for example, only DICOM QA check is needed if only a DICOM
+upgrade was performed for the ROIS. This framework proposed here is
+very instrumental in paving the way to a widely accepted quality assurance program for modern radiation oncology information system within the radiation oncology community, not only during specific events, such as upgrade or data migration, but also on a routine basis,
+such as, quarterly or yearly.
+The main purpose of this framework is to verify data integrity between two ROIS states. It is not designed to check any dynamic data update in ROIS databases. Therefore, during the execution of this framework, the ROIS software should be kept from updating the ROIS database, such as addition/deletion of a database table record or an EMR document. Such updates from the ROIS software will alter the ROIS database to change the ROIS state, which will lead to unreliable results. Although this framework can implicitly check some ROIS software functionalities and behaviors, it should not be used as a complete ROIS software QA tool. The ROIS software functionality QA should be fully performed by the vendors.
+
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+35
diff --git a/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt b/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt
new file mode 100644
index 0000000..23bb037
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt
@@ -0,0 +1,158 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+2020 IEEE International Students' Conference on Electrical, Electronics and Computer Science
+Testing MapReduce program using Induction Method
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
+
+Ashish Kumar Rai
+Department of Computer Science and Engineering Kamla Nehru Institute of Technology (KNIT), Sultanpur, UP, INDIA
+email.ashishrai@gmail.com
+Abstract—MapReduce is “divide and conquer” applied paradigm for processing large volume of data to filter out information to solve day to day complex challenges. MapReduce is core of big data applications. The challenging part to test these applications which also represent the characteristic of these applications are variation in data due to different format and sources. In other words, poor quality of input data can deviate system towards failure if not handled properly programmatically for variety of input data. MapReduce program itself based on transformations at different level based on the program logic. This paper proposes the testing technique based on the mathematical induction principle and considered as extension or conjunction of other testing techniques already in use either based on transformations analysis from input to output as in MRFlow. Proposed function testing can be used in business acceptance testing and showcase the correctness of program, further can detect many defects even before shipping bigdata application in live.
+ Keywords—MapReduce, Data Defects, Induction, MapReduce Testing, MapReduce business acceptance testing.
+I. INTRODUCTION
+Software testing is the process of finding error or defect in program or finding deviation (if any) in expected behaviour or end result. The purpose of this exercise is to improve the quality of software and reduce related cost of defect fix if encountered in live environment. To test bigdata application individual testing required in each stage from extraction of data, loading data in HFDS, transformation and utilization of data as per business requirement and further representing report or dashboard. To meet envisioned purpose of business application it is equally desirable to perform functional and non-functional testing. MapReduce should be considered as layer of bigdata application where key business rules get implemented. This makes testing of MapReduce as key factor for successful of the bigdata implementation.
+Lecture “Big Data Essentials: HDFS, MapReduce and Spark RDD” available on coursera website, suggests performing unit, integration, system and acceptance testing [3]. This paper proposed another approach of functional testing based on mathematical induction principle and help to showcase correctness of MapReduce program. This approach should be considered as harmonizing other method used to perform functional testing of MapReduce application.
+As per book Concrete Mathematics, Scientific acceptance of mathematical induction has already discussed in different articles and can be understood with example that we will climb as tall as we like on a stepping stool, by demonstrating that able to climb onto the foot rung (the premise) which from each rung we are able climb up to the following one (the step)[4].
+Dr. A. K. Malviya
+Department of Computer Science and Engineering Kamla Nehru Institute of Technology (KNIT), Sultanpur, UP, INDIA
+anilkumarmalviya@gmail.com
+This metaphor helps to utilize mathematical induction to solve by formal verification.
+The remaining paper is organized as follows: section 2 describes the MapReduce paradigm, techniques, tools used for MapReduce and related work done in this area. Next, section 3 presents the proposed technique of this paper along with mathematical model of Induction method. Section 4 is a case study which showcases the example of proposed MapReduce testing technique. Further section is conclusion notes for this paper.
+II. BACKGROUND
+As per press release on September 11, 2017 Gartner’s Hyper Cycle revealed that big data would achieve mainstream maturity within two to five year. This indicate wider acceptability and future technology in IT as bigdata application to support business need and identify hidden potential opportunities. Big Data shown high level of acceptance and maturity where MapReduce is intrinsic core framework for big data applications [1].
+
+Fig. 1. Gartner’s Hyper Cycle
+The three Vs - Variety, Volume and Velocity (sometime includes Veracity) - are commonly used to describe different aspects of big data or commonly known as Characteristics of Big Data. Sensors & Devices, Social Media, Enterprise and Internet are contributing exponential growth in data volume. With a rough estimation more than 2 trillion gigabytes of data created daily and need high velocity processing. The data may be structured and unstructured with diversify source such as error log, IoT, data from social networks includes but not limited to image data, recordings, visuals, spreadsheet data,
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
+
+978-1-7281-4862-5/20/$31.00 ©2020 IEEE
+text and many more. To resolve the 3Vs challenges of bigdata,
+Hadoop is presented as a solution. As per Wikipedia &
+Apache, Hadoop provides framework for distributed storage B. Testing MapReduce
+and processing by using MapReduce and can be considered as Coursera lecture “Big Data Essentials: HDFS, MapReduce collection of multiple open source utilities to solve problem and Spark RDD” suggest multiple level testings need to which requires more computation and/or storage. Before be performed for MapReduce application - unit, finding test approach and strategy for bigdata application, one integration, system and acceptance testing [3].
+must understand that big data is not only about data volume. It Unit Testing – Unit testing for MapReduce program can should be considered more as verification process at each step be done separately for mapper and reducer function and and include functional and non-functional testing. Source level can be run on local node. This includes white box validation to verify correct extracted data loaded in HDFS, texting of code. Different tools available to test mapper Validation of MapReduce to verify business logic validation on or reducer function such as MRUnit [20] and Junit [21]. local node (or single node) and then validating on multiple Apart from mapper and reducer, MR Jobs can be tested nodes with validation of output target data to meet business locally on single JVM.
+outcome. This paper proposed first attempt testing MapReduce Integration Testing – Once unit testing completed for based on mathematical induction and can be considered as part individual mapper and reducer function, integration of extended functional testing which provide further confidence testing should be performed on local machine validating on the correctness of MapReduce program and showcase output of mapper function is getting accepted by transformations are as expected. reducer function. Further Reducer should be able to
+process data as per design.
+A. MapReduce System Testing – After completion of integration testing, system testing should be performed and more
+Define MapReduce is a framework to perform parallel likely on distributed environment, both functional and processing on large data stored in distributed over large number non-function testing should be completed before of machines. Each machine computes data stored locally, handling over application for acceptance testing. which in turn contributes to distribute and parallel processing. Function testing take cares of the business requirement The MapReduce follows the "divide and conquer" principle and validate if application is meeting functional aspects [15] where dividing problem to subproblem can be considered while non-functional testing focus on validation of as Map while collating results from subproblem can be performance aspects and volume capabilities of considered as Reduce. With advancement of Hadoop application.
+framework as Hadoop2.0, MapReduce is more focused on data Acceptance Testing – This level of testing is performed processing while in Hadoop1.0 it was overloaded with cluster just before shipping application in live environment and resources management which is now handled by Yarn [5]. show case the application is working as per agreement
+and compliant with business requirement. Most of the
+MapReduce consists of two steps: time it should be performed by business users (or mix of
+(1) Mapper tester along with business user) and considered as
+(2) Reducer consent of acceptance for software application. So, Mapper function processes input data and convert them to MapReduce application should be tested in live like intermediate set of data, generally documented as key- value environment, generally black box testing approach is pair tuple, and further Reducer consume these key-value pair applied for this kind of testing [8].
+and combine or process them in smaller set of tuples.
+C. Related Work
+In logical terms, Map function applied on key value pair and MapReduce programs and their testing have been studied returns list of different key value set while Reduce function with different domain like finance, retail, health, defense consume this output and process them as another collection of [9][10] and found multiple challenges [18]. Most of the Big value for given key. The multiple process of mapper and Data applications are developed on top of the MapReduce reducer run in parallel on different node of Hadoop cluster programs [15] which process variety of data having multiple locally to solve large volume big data problem. sources consisting large volume and should be processed in high velocity. While Camargo and Vergilio studied MapReduce program testing and presented observation in their
+paper [16].
+ Authors L. Bu and Y. Xiong in their work tried to cover reachability testing in MapReduce program which run in concurrent distributed environment [11]. The paper showcases the design and implementation of a parallel reachability testing approach based on Hadoop MapReduce (PRT) with dynamic loading.
+On the other paper, Authors worked on the detection of design fault in MapReduce where test data executed in parallel depends on test input data and test configurations. Authors
+Fig. 2. Map Reduce logical workflow
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
+
+propose MRTest testing based techniques presented in paper to automate detection of configuration and design fault [12].
+With reference to [13], authors propose a testing technique for different infrastructure configurations execution of test cases on various input data to find out infrastructure related issue or environmental issues. The testing technique helps to automate validation through test engine and applied on real world example.
+Authors propose approach to test security policies for MapReduce [14]. Authors suggest FSM formalization for MapReduce in consideration of security policies specification conforming XACML language.
+Chen, Ganapathi, Griffith and Katz studied MapReduce and presented paper with their finding as performance evaluation for MapReduce [17].
+Moran, Riva, and Tuya in paper “MRTree: Functional testing based on MapReduce’s execution behaviour”, showcases the functional testing method for MapReduce program based on tree node navigations depth and breadth coverage to find out potential faults in MapReduce program [19].
+
+Fig. 3. Word count program - Reduce function
+Moran, Riva, and Tuya in another paper “Testing data transformations in MapReduce programs” discussed approach to test MapReduce program based on data flow and proposed testing technique as MRFlow to analyze transformation in MapReduce program by depicting graph to cover different cases and to reveal defect [22]. For given WordCount program [7], authors presented MRFlow graph based on data flow.
+
+Fig. 4. MRFlow graph for Reduce function
+In paper "Towards Ex Vivo testing of MapReduce applications”, authors suggested "Ex Vivo" context independent test approach to detect faults based live data and run on different environment [23]. On the other hand, in another paper authors systematically searches for bugs in MapReduce program and generates test cases [24].
+The author tries to showcase properties of inductive inference for showing correctness of program and using this for software testing [25].
+III. PROPOSED TESTING TECHNIQUE
+From acceptance testing prospective, considering the complexity of MapReduce program, it is hard to test and verify if program is running correctly and application is working as per business requirement. Most of the time acceptance testing is done as black box testing with minimal code structure knowledge. To support acceptance testing of applications based on MapReduce program, an approach can be adopted which is influenced by mathematical induction. It suggests that for given domain if it can be proved that application is working fine for base case, data set and incremental data set as expected, application or program is more likely correct and conform to business requirement. In more simple words, induction proof supports program correctness. Online resource [27] further provides some example using induction to verify and prove correctness of program.
+A. Mathematical Induction
+Finding mathematical results based on mathematical principle to showcase its larger applicability: an assertion A(i) for natural number i can be proved if base or initial case A(1) is true and assuming it is also true for A(n) where n is any other natural number but it can be proved true for next natural number n+1 implies that A(n+1) is also true. The proof of initial case A(1) is the first step while proof of A(n+1) is called the induction step and n is called the induction parameter. It is the basis for inductive definition [26]. The proof can be represented as following steps:
+1. The base or initial case: proving statement holds for 0 or 1.
+2. The induction step: with assumption statement holds for n and proving statement holds for n+1.
+Axiom: P(0/1)&∀x(P(x)⊃P(x+1))⊃∀x P(x).
+B. Applied Testing Technique
+So far mathematical induction is used to prove program correctness using formal method or logical inference. Other approach based on induction is inductive testing. But we recommend using the applied understanding of mathematical induction for acceptance testing of MapReduce application in combination with black box approach. Since acceptance testing is performed by business user or mix of tester along with business user. Following suggested algorithm can be used to test MapReduce application
+Algorithm
+Step 1. Run Application for primitive value which is
+NULL
+Step 2. Validate that the application is giving correct
+output with NULL value
+Step 3. Run Application for primitive value which is
+Zero
+Step 4. Validate that the application is giving correct
+output with Zero value
+Step 5. Run Application for base value which is minimal
+data (or data set)
+Step 6. Validate that the application is giving correct
+output with minimal data set
+Step 7. Run the application for given data set X and
+record the output for further analysis
+Step 8. Add ΔX (delta) in given data set x
+Step 9. Run the application for X + ΔX data set
+Step 10. Compare the output with step 7
+Step 11. Validate if data is as per the acceptance criteria Step 12. Output in Step 11 is as per the acceptance criteria Step 13. Iterate the program from step 7 for other data sets
+(variety of data) and validate
+Step 14. Validate output for other data sets to see
+correctness of the program
+CONCLUSION
+The proposed testing technique is simple but effective to find bugs in MapReduce program without worrying about architectural complexity of underlying framework. It provides confidence for program correctness and validation results for acceptance testing ensuring meeting business functional requirement in live like environment. The MapReduce programs are more prone for defects due to incorrect validation, data type mismatch or following wrong processing for key value pair or exception handling. Even sometime defects can be for incorrect business calculations. These defects may cause program failure or may have business impacts. The proposed technique provides test cases for exception such as primitive cases along with validating them against business requirement for given data set show casing program correctness.
+As future work we plan to apply sampling for variety or voluminous data or finding acceptance index for iteration on data set, further it can be automated with inclusion of machine learning for test coverage and execution.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
+
+Depending on business requirement or logical inference
+base case can be identified which represent minimal data set on
+which program run. Step 1 and 3 validate program for NULL REFERENCES
+and Zero to provide a fair chance to check negative test [1] Gartner press release https://www.gartner.com/en/newsroom/press- condition if MapReduce program is built considering no input releases/2017-09-11-gartner-hype-cycle-reveals-the-digitalization-of-
+or blank data. Since we are doing acceptance testing, output for the-supply-chain
+primitive cases for Zero or NULL along with base case can be [2] Weyuker, E. J. ‘Assessing test data adequacy through program validated based on business logic. For other input and output inference’, ACM Transactions on Programming Languages and data business may have defined domain for input and Systems, 5 (4), (1983) , 641-655.
+corresponding range values for output. Step 7 recommends [3] Chtotpusr:s/e/wRAww.courseMra.aoprRg/eldecutcuer e/big-datTae-esstisnegn tials/testing-t48UaLecture running application program for given test data set and record [4] Ronald L. Graham, Donald E. Knuth, and Oren Patashnik ‘Review of
+results considering it is inline as per business expectation. Now Concrete Mathematics: A Foundation for Computer Science, 2nd Step 8 suggests adding a known Δ (delta – small) value in input edition’Pg3 margin (1989)
+data set X and validate if output changes are corresponding [5] Hadoop: open-source software for reliable, scalable, distributed input Δ changes in conjugation of output of step 7. Step 11 and computing. http://hadoop.apache.org/.
+12 helps in validation of input and output matching with [6] Institutions that are using hadoop for educational or production uses. corresponding domain and range along with meeting business http://wiki.apache.org/hadoop.5.
+logic of application. [7] Wordcount 1.0. http://hadoop.apache.org/docs/r2.7.0/hadoop-
+mapreduce-client/hadoop-mapreduce-client-
+Since MapReduce program usually run on variety of core/MapReduceTutorial.html#Example:_WordCount_v1.0
+volume data step 13 and 14 helps to iterate program for other [8] IEEE draft international standard for software and systems engineering– variety of data. To find how many iterations required sampling software testing–part 4: Test techniques, 2014.
+or acceptance index can be identified. This converge [9] Schatz, M. C. Cloudburst: highly sensitive read mapping with acceptance testing objective to find program correctness and mapreduce. Bioinformatics 25, 11 (2009), 1363–1369.
+validating application for meeting business requirement. [10] Kocakulak, H., and Temizel, T. T. A hadoop solution for ballistic image analysis and recognition. In High Performance Computing and
+Simulation (HPCS), 2011 International Conference on (2011), IEEE, pp.
+IV. CASE STUDY 836–842..
+While exploring the applicability of proposed testing [11] L. Bu and Y. Xiong (Eds.): SATE 2018, LNCS 11293, pp. 173–184,
+2018.
+techniques, it has been applied on popular know example of
+MapReduce program WordCount[7] which is program written to find the frequency of every word in input text. To test
+[12] Jesús Morán, Antonia Bertolino, Claudio de la Riva and Javier Tuya "Automatic Testing of Design Faults in MapReduce Applications" in IEEE Transactions on Reliability (2018) pp. 717-732.
+WordCount program at unit level authors Moran, Riva, and [13] J. Morán, B. Rivas, C.D.L. Riva, J. Tuya, I. Caballero, M. Tuya suggested different testing techniques MRFlow based on Serrano,"Configuration/Infrastructure-aware testing of MapReduce data flow [22]. But approach suggested in this paper is programs", Advances in Science, Technology and Engineering Systems
+primarily for acceptance testing and successful to find bug such Journal, vol. 2, no. 1, (2017) pp. 90-96.
+as given program fails for primitive case NULL where no input file is given. Program is again validated with text file not
+[14] Sara Hsaini, Salma Azzouzi and My El Hassan Charaf "FSM Modeling of Testing Security Policies for MapReduce Frameworks" in IEEE Conference (2019) pp. 1480-1485.
+having any word for another primitive case. Further program is [15] Sharma, M., Hasteer, N., Tuli, A., and Bansal, A. Investigating the validated for base case where only one word is present in input inclinations of research and practices in hadoop: A systematic review. In text file. WordCount program is then run on given text file as Confluence The Next Generation Information Technology Summit step 7 execution and result is recorded. Further given text file is (Confluence), 2014 5th International Conference- (2014), IEEE, pp.
+modified by adding known frequency of certain words. 227–231.
+Program ran on modified text file as step9 and output is validated for known frequency changes in added words.
+[16] Camargo L. C., and Vergilio S. R. Mapreduce program testing: a systematic mapping study. In 32nd International Conference of the Chilean Computer Science Society (SCCC) (2013).
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
+
+[17] Chen, Y., Ganapathi A., Griffith R., and Katz R. The case for evaluating mapreduce performance using workload suites. In Modeling, Analysis & Simulation of Computer and Telecommunication Systems (MASCOTS), 2011 IEEE 19th International
+[18] Gudipati, M., Rao, S., Mohan, N. D., and Gajja, N. K. Big data: Testing approach to overcome quality challenges. Big Data: Challenges and Opportunities (2013), 65–72.
+[19] J. Moran, C. de la Riva, and J. Tuya, “MRTree: Functional testing based on MapReduce’s execution behaviour,” in proceedings International Conference Future Internet Things Cloud, 2014, pp. 379–384.
+[20] Apache MRUnit. [Online]. Available: http://mrunit.apache.org.
+[21] JUnit. [Online]. Available: http://junit.org.
+[22] J. Mor´an, C. de la Riva, and J. Tuya, “Testing data transformations in MapReduce programs,” in Proc. 6th Int. Workshop Automat. Test Case Design, Selection Evaluation, 2015, pp. 20–25.
+[23] J. Mor´an, C. de la Riva, and J. Tuya, “Testing data transformations in MapReduce programs,” in proceedings. IEEE International Conference on Software Quality, Reliability and Security, 2017, pp. 73–80.
+[24] Christoph Csallner, Leonidas Fegaras y Chengkai Li. New Ideas Track: Testing MapReduce-Style Programs. Proceedings of the 19th ACM SIGSOFT symposium and the 13th European conference on Foundations of software engineering. Pages 504-507.
+[25] Zhu, H.: A formal interpretation of software testing as inductive inference. Software Testing, Verification and Reliability 6(1) (1996) 3– 31
+[26] Hazewinkel, Michiel, [1994], "Mathematical induction", Encyclopedia of Mathematics, Springer Science+Business Media B.V. / Kluwer Academic Publishers, ISBN 978-1-55608-010-4 ed. (2001) [Online] https://www.encyclopediaofmath.org/index.php/Mathematical_induction
+[27] Lecture “Verifying the Correctness of Programs” [Online] http://www.cs.cornell.edu/courses/cs312/2006sp/lectures/lec10.html
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+SCEECS 2020
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply.
diff --git a/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt b/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt
new file mode 100644
index 0000000..28e24e8
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt
@@ -0,0 +1,148 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using
+Framework Abstraction
+Qian Zhang Jiyuan Wang Muhammad Ali Gulzar
+University of California, Los Angeles University of California, Los Angeles Virginia Tech
+zhangqian@cs.ucla.edu wangjiyuan@g.ucla.edu gulzar@cs.vt.edu
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Rohan Padhye
+Carnegie Mellon University rohanpadhye@cmu.edu
+ABSTRACT
+As big data analytics become increasingly popular, data-intensive scalable computing (DISC) systems help address the scalability issue of handling large data. However, automated testing for such data-centric applications is challenging, because data is often incomplete, continuously evolving, and hard to know a priori. Fuzz testing has been proven to be highly effective in other domains such as security; however, it is nontrivial to apply such traditional fuzzing to big data analytics directly for three reasons: (1) the long latency of DISC systems prohibits the applicability of fuzzing: naïve fuzzing would spend 98% of the time in setting up a test environment; (2) conventional branch coverage is unlikely to scale to DISC applications because most binary code comes from the framework implementation such as Apache Spark; and (3) random bit or byte level mutations can hardly generate meaningful data, which fails
+to reveal real-world application bugs.
+We propose a novel coverage-guided fuzz testing tool for big data analytics, called BigFuzz. The key essence of our approach
+is that: (a) we focus on exercising application logic as opposed to increasing framework code coverage by abstracting the DISC framework using specifications. BigFuzz performs automated source to source transformations to construct an equivalent DISC application suitable for fast test generation, and (b) we design schema-aware data mutation operators based on our in-depth study of DISC application error types. BigFuzz speeds up the fuzzing time by 78 to 1477X compared to random fuzzing, improves application code coverage by 20% to 271%, and achieves 33% to 157% improvement in detecting application errors. When compared to the state of the
+art that uses symbolic execution to test big data analytics, BigFuzz is applicable to twice more programs and can find 81% more bugs.
+KEYWORDS
+fuzz testing, big data analytics, test generation
+Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).
+ASE ’20, September 21–25, 2020, Australia ©2020 Copyright held by the owner/author(s).ACM ISBN 978-1-4503-6768-4/20/09. https://doi.org/10.1145/3324884.3416641
+Miryung Kim
+University of California, Los Angeles miryung@cs.ucla.edu
+ACM Reference Format: QianZhang,JiyuanWang,MuhammadAliGulzar,RohanPadhye,andMiryung Kim. 2020. BigFuzz: Efficient Fuzz Testing for Data Analytics Using Frame- work Abstraction. In 35th IEEE/ACM International Conference on Automated Software Engineering (ASE ’20), September 21–25, 2020, Virtual Event, Aus- tralia. ACM, New York, NY, USA, 12 pages. https://doi.org/10.1145/3324884. 3416641
+1 INTRODUCTION
+Emerging technologies are producing much data and the importance of data-centric applications continues to grow. Data-intensive scalable computing (DISC) systems, such as Google’s MapReduce [30], Apache Hadoop [1], and Apache Spark [2], have shown great promises to address the scalability challenge of big data analytics. Although DISC systems are becoming widely available to industry, DISC applications are difficult to test and debug. Data scientists often test DISC applications in their local environment using sample data only. These applications are thus not tested thoroughly and may not be robust to bugs and failures in the production setting.
+The correctness of DISC applications depends on their ability
+to handle real-world data; however, data is inherently incomplete, continuously evolving, and hard to know a priori. Motivated by the successes of systematic test generation tools [33,34,62], a few have been proposed for dataflow-based DISC applications [38, 45, 52]. For example, BigTest [38] uses symbolic execution to automatically enumerate different path conditions of a DISC application and generate concrete inputs using an SMT solver. However, its applicability is limited to the dataflow operators (e.g., map, reduce, join, etc.) where symbolic execution is supported, and limited by the path exploration capability of the underlying symbolic execution engine and an SMT solver. In other words, developing a robust test generation tool for DISC applications remains an open problem.
+In recent years, coverage-guided mutation-based fuzz testing has emerged as one of the most effective test generation techniques for large software systems [17, 49]. Such fuzz testing techniques are based on implicit assumptions that it takes a relatively short amount of time to repetitively run programs with different inputs and arbitrary byte level mutations are likely to yield reasonable inputs. In fact, most fuzzing techniques start from a seed input, generate new inputs iteratively by mutating the previous inputs, andaddnewinputstotheinputqueueiftheyexerciseanewbranch.
+* This research was done, while the third and fourth authors were graduate students at UCLA and UC Berkeley respectively.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+However, our experience tells us that fuzzing cannot be applied to big data analytics directly. First, the long latency nature of DISC systems prohibits the efficacy of traditional fuzzing. While traditional fuzzing techniques assume thousands of invocations per second, for example, Apache Spark applications would need about 10 to 15 seconds to initialize the Spark context for each run—job scheduling, data partitioning, and serialization all contribute to increased latency. Second, low-level mutations (e.g., flipping a bit or byte) in existing naïve fuzzers can hardly explore corner cases that represent realistic application bugs. Lastly, grammar-aware fuzzers [35, 43, 70] exist to reduce the time required for constructing meaningful inputs. However, they require a user to provide grammar rules and, by definition, they do not produce inputs violating the user-provided grammar rules.
+In this paper, we lay the groundwork for embodying a coverage- guided, mutation-based fuzz testing approach for big data analytics. The key insight behind BigFuzz is that fuzz testing of DISC applica- tions can be made tractable by abstracting framework code and by analyzing application logic in tandem. Our key idea is to perform source-to-source transformation of a DISC application to a seman- tically equivalent, yet a framework-independent program that is more amenable to fuzzing.
+Based on the insight that a DISC application developer writes ap- plicationlogicintermsofuser-definedfunctionsandconnectsthem usingdataflowoperatorsintheDISCframework, BigFuzz focuseson exercising application logic as opposed to the DISC framework im- plementation. BigFuzz uses a two-level instrumentation method to monitor application-specific coverage, while modeling the different outcomes of dataflow operations. As such combination of behav- ior modeling is independent of the underlying DISC framework implementation, we can abstract the framework with executable specificationsandgenerateaSparkcontextfreeprogramtomitigate the long latency caused by the DISC framework. An application de- veloper is not required to write any custom specifications, because the specifications for dataflow operators such as mapand reduce do not need to be re-written for each application. BigFuzz fully automates this process of constructing a semantically equivalent DISC application through source to source transformation.
+As opposed to random bit or byte-level input mutations, we design schema-aware mutation operations guided by real-world error types. These mutation operations increase the chance of creating meaningful inputs that map to real-world errors. To inform the design of such data mutation operators, we conducted a systematic study on common error types and root causes in Apache Spark and Hadoop applications using two complementary sources: Stack Overflow [3] and Github [4]. The study identified ten common error types, which we map and encode in terms of six different mutation operators in BigFuzz.
+We evaluate BigFuzz on a benchmark of twelve Apache Spark ap- plications. We comparethe time togenerate test inputsand theiras- sociated error-finding capabilities against two baseline techniques: random fuzzing, and symbolic-execution based testing. With frame- work abstraction, BigFuzz is able to speed up the fuzzing time by 78 to 1477X compared to random fuzzing. Schema-aware mutation operations can improve application code coverage by 20 to 200% with valid inputs as seeds, which leads to 33 to 100% improvement in detecting application errors, when compared to naive random
+fuzzing. Even without valid input seeds, BigFuzz improves application code coverage by 118 to 271% and error detection by 58 to 157%, demonstrating its robustness. We show that BigFuzz is applicable to twice more applications and can find 81% more bugs than the state of the art, BigTest.
+In summary, this work makes the following contributions:
+(1) We propose a fuzz testing technique called BigFuzz that targets DISC applications by automatically abstracting the dataflow behavior of the DISC framework with executable specifications. This novel approach can also be generalized to other systems with long latency.
+(2) We propose an automated instrumentation method to moni- tor application logic in conjunction with how dataflow op- erators are exercised in terms of their dataflow equivalence class coverage.
+(3) Wepresentschema-awaremutationoperationsthatareguided by real-world errors encountered in DISC applications. To our knowledge, we are the first to design a fuzz testing tech- nique by empirically studying and codifying mutations that correspond to real-world DISC bugs.
+(4) Our experimental evaluation on 12 Apache Spark applica- tions demonstrates that BigFuzz outperforms prior work in terms of code coverage and error-detection capability.
+We provide access to artifacts of BigFuzz at https://github.com/ qianzhanghk/BigFuzz.
+2 BACKGROUND
+Apache Spark. BigFuzz targets Apache Spark, a widely used data intensive scalable computing system but can generalize to other DISC frameworks. Spark achieves scalability by creating Resilient Distributed Datasets (RDDs), an abstraction of distributed collec- tion[73].ProgrammerscantransformRDDsinparallelusingdataflow operations, e.g.,val newRDD = RDD.map(s => s.length).Dataflow operators such as filter, map, and reduce are implemented as higher-order functions that take a user-defined function (UDF) as an input argument. The actual evaluation of an RDD occurs when an action such as count or collect is called. For example, a Spark application developer writes application logic in terms of UDFs and connects them using dataflow APIs. To execute the program, Spark first translates a program into a Directed Acyclic Graph (DAG), where vertices represent various operations on the RDDs, and then executes each stage in a topological order.
+Thecommonindustrypracticefortestingsuchbigdataanalytics applications remains running them locally on a randomly sampled dataset.Testingwithsampledataisoftenincompletewhichleadsto rare buggy cases in production runs. Often Spark programs run for days and then crash without an obvious reason. Additionally, the start up latency associated with invoking the Spark frameworkand Block Manager Mastercan take several seconds for simply setting up an execution environment and repetitive data partitioning, job scheduling, serialization, and deserialization to support distributed execution all contribute to increased latency. Thus random fuzzing would be prohibitively expensive to test big data analytics.
+Fuzz Testing. Fuzz testing such as AFL [17] has been proven to be highly effective in synthesizing test inputs that achieve high code coverage and find bugs. Given an input program, it instruments
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+Figure 1: Approach Overview of BigFuzz
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+1 val loan = sc.textFile("account_history.csv")
+2 // Input with zipcode, base loan, years, and rate
+3 .map{ line => val cols = line.split(",") 4 (cols(0),cols(1).toFloat,
+5 cols(2).toInt,cols(3).toFloat) }
+6 //Return zipcode, base loan, years, and rate
+7 . map{ s =>
+8 val a = s._2
+9 for(i <- 1 to s._3)
+10 a = a * (1 + s._4)
+11 (s._1, a) }
+12 // Return zipcode and final loan
+13 val locations = sc.textFile("zipcode.csv")
+14 //input with zipcode and city
+. map{ s =>
+1516➊ val cols = s.split(",")
+17 (cols(0), cols(1) }
+18 //Return zipcode and city
+19 .filter{ s => s._2 == "New York" }
+20 val output = loan.join(locations)
+21 . map{ s =>
+22 if(s._2._1 > 10000) ("Property Loan",10000) 23 else if(s._2._1 > 1000) ("Car Loan",1)
+24 else ("Credit Debt",1) }
+25 //Return three categories based on the loan amount 26 .reduceByKey( _+_ )
+1 ArrayList results0 = LoanSpec.read(inputFile1);
+2 ArrayList results1 = LoanSpec.map1 (results0);
+3 ArrayList results2 = LoanSpec.map2 (results1);
+4 ArrayList results3 = LoanSpec.read(inputFile2);
+5 ArrayList results4 = LoanSpec.map3 (results3);
+6 ArrayList results5 = LoanSpec.filter1 (results4); ➊
+7 ArrayList results6 = LoanSpec.join1(results5, results2);
+8 ArrayList results7 = LoanSpec.map4 (results6)
+9 ArrayList results8 = LoanSpec.reduceByKey1 (results7)
+(b) A transformed program LoanType.java with executable specifications
+1 public ArrayList map3(ArrayList input){
+2 ArrayList output = new ArrayList<>(); ➊ 3 for (String item: input){
+4 output.add( Map3.apply(item) );}
+5 return output;}
+(c) Specification implementation of map3in LoanTypeSpec.java
+1 public class Map3 {
+2 static final Map3 apply(String line2) {
+3 String cols[]=line2.split(",");
+4 return new Map3(cols[0],cols[1]); }
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+(a) A DISC application LoanType.scala (d) The extracted UDF from lines 14 to 16 of Figure 2a is represented as Map3.java
+Figure 2: Example code transformation and framework abstraction
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+the program’s bytecode, iteratively generates new inputs by mu- tating several bits or bytes of the seed input, and collects coverage feedback by executing the instrumented program with new inputs. All inputs that exercise a new code branch are then be saved for further mutation. The implicit assumption underlying such itera- tive fuzzing is that the target program can run fast, (i.e., thousands of invocations per second); unfortunately, this assumption is false for many long latency applications such as big data analytics. For example, initializing the Spark context in local model to initiate a distributed data pipeline takes 19 seconds, which correspond to 98% of the total execution time with a typical testing input. The long latency prohibits the applicability of fuzzing for efficient test generation. Besides, naively monitoring branch coverage in DISC applications is unlikely to exercise application logic adequately, since most binary code comes from the DISC framework imple- mentation (e.g., roughly 700 KLOC for Apache Spark). Under this circumstance, naive attempt to increase code coverage may eventu- ally run out of memory. Furthermore, random byte-level mutations can hardly generate meaningful structured or semi-structured data to explore application logic effectively.
+3 APPROACH
+BigFuzz contains three components that work in concert to make coverage-guided fuzz testing tractable for big data analytics. Fig- ure 1 shows (A) abstraction of dataflow implementation using source-to-source transformation with extracted user-defined func- tions, discussed in Section 3.1, (B) two-level instrumentation for coverage monitoring, discussed in Section 3.2), and (C) input muta- tionsgearedtowardsbigdataanalyticerrorsbasedonourempirical study,discussedinSection3.3.Thisapproachisbasedontheinsight that(1)wecanreducelonglatencyofDISCapplicationsbyabstract- ingdataflowimplementationinaDISCframeworkusingexecutable specifications and (2) we can focus on exercising application logic rather than the entire framework by monitoring code coverage of user-defined functions in tandem with equivalence classes of ab- stracted dataflow behavior. Although BigFuzz is designed for Spark programs, its key idea can generalize to other DISC frameworks such as Hadoop by rewriting the dataflow operator APIs to our current set of corresponding specification implementation.
+3.1 Framework Abstraction for Fuzzing
+As discussed in Section 2, DISC applications have high latency, making them not suitable for traditional fuzz testing because they
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+Table 1: Dataflow Operator and Corresponding Equivalence Classes
+
+Spark Dataflow Operator
Transformed Operator
Equivalences Classes
def filter(udf:T→ Boolean): RDD[T]
+Return an RDDthat satisfies a predicate udf:T→Boolean
ArrayList filter (ArrayList Input)
+Return an ArrayList of elements passing udf where udf:T → Boolean is implemented in filter
F1: Non-Terminating: ∃t.udf (t) = true
F2: Terminating: ∃t.udf (t) = f alse
def join[W](other: RDD[(K,W)]):Rdd[(K,(V,W))] Return an RDDcontaining all pairs of elements with matching keys in this and other RDDs.
ArrayList join (ArrayList L, ArrayList R) Return an ArrayList of elements from left ArrayList tL ∈L and right ArrayList tR ∈R, with matching keys tL,key = tR,key
J1: Non-Terminating: ∃tL,tR.tL,key = tR,key
J2: Terminating: ∃tL,∀tR.tL,key! = tR,key
J3: Terminating: ∃tR,∀tL.tR,key! = tL,key
def map[U](udf:T→U)
+Return a new RDD by applying udf:T→ U t of this RDD.
ArrayList map (ArrayList Input)
+Return a new ArrayList by applying a udf:T→ Uto this ArrayList where udf:T→ Uis implemented in map.
M: Non-Terminating: always non-terminated
def reduceByKey(udf:(V,V) → V) : RDD [K,V] Merge the values for each key using an associative reduce function.
ArrayList reduceByKey (ArrayList Input) Merge the values for each key using udf:(V,V) → V where udf:(V,V) → Vis implemented in reduceByKey
R: Non-Terminating: always non-terminated
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+spendseveralsecondsjusttoinitializeSpark’sexecutioncontextfor each run. Theoretically, the long start-up latency can be somewhat reduced by sharing one Spark execution environment for multiple runs;however,suchpracticeisstillnotenoughtoachievemillionsof executions per minute, because each run still needs to pass through
+a data partitioner, a query optimizer, a job scheduler, and a data serializer/deserializer, etc.
+In DISC frameworks, the implementation of dataflow and rela- tional operators is influenced by and universally agreed upon the semantics of such operators [68]. For example, although a dataflow operator join may have a specialized physical implementation in each framework (e.g., hash join), it has the same consistent logical semantics across all DISC frameworks. BigFuzz takes advantage of this observation, rewrites a DISC application into an equivalent applicationthatusesdataflowspecifications,andmonitorsdifferent equivalence class coverage of dataflow operations. For example, filter has two equivalence classes—one passing the filter predi- cate and the other not passing the filter. Because dataflow operators are deterministic and state-less [72], the transformed program is guaranteed to be equivalent to the original program. For example, map{x => (x,1)} will always give the same output for the same input for both the spec-based program and the original program.
+We map each dataflow operator’s implementation to a corre- sponding simplified yet semantically-equivalent implementation, which we call executable specifications. Such specifications help eliminate the dependency on the framework’s code, transforming
+a DISC application into an equivalent, simplified Java program that can be invoked numerous times in a fuzzing loop.
+BigFuzz automates this process of rewriting in two steps: (1) UDF extraction and (2) source to source transformation. Figure 2 illus- tratesthisprocessusinganexampleDISCapplicationthatidentifies thefrequencyofeachloantypewithinametropolitanarea.Thispro- gram is a variation of one of the DISC Benchmark [38]. We formu- lateadistributed,RDD-basedimplementationusingSpark’sAPIs(➊ in Figure 2a) to a simplified, executable specification of mapin Fig- ure 2c. Table 1 shows a few sample mappings between Spark RDD’s dataflow implementation APIs, equivalent spec-implementations using ArrayList, and a set of corresponding equivalence classes for each dataflow operator.
+Step 1. User-Defined Function (UDF) Extraction. To re-write a DISC application to use executable specifications only, BigFuzz de- composes the application into two components: (1) a direct acyclic graph (DAG) of dataflow operators and (2) a list of corresponding UDFs. Internally, BigFuzz decompiles the bytecode of the original
+application into Java source code and traverses Abstract Syntax Tree(AST)tosearchforamethodinvocationcorrespondingtoeach dataflow operator. The input arguments of such method invoca- tions represent the UDFs, which are stored as separate Java classes as shown in Figure 2d.
+Step2.SourcetoSourceTransformation. BigFuzz usestheDAG extracted in the previous step to reconstruct the DISC application in the same, interconnected dataflow order using executable specifi- cations. Such dataflow spec implementation takes in an ArrayList object as input, applies the corresponding UDF on each element of the input list, and returns an output ArrayList. For example, class LoanSpec.map3 (➊ in Figure 2b) represents the equivalent spec implementation using ArrayList that corresponds to map
+• in Figure 2a. It takes in results3 from its upstream opera- tors and returns an ArrayList result4 for downstream operator, LoanSpec.filter1. BigFuzz selects the corresponding UDFs from
+the list of UDFs extracted from step 1 and weaves them with the equivalent specifications shown in column 2 of Table 1. For exam- ple, Java classMap3has method apply mapping to the original UDF
+• in Figure 2a, and this method is invoked on each element of the input list as seen in Figure 2c.
+The above rewriting from a Spark application in Scala or Java to an equivalent Java application reduces the latency of running a DISC application, while retaining the same semantics. It also makes it easier to collect guidance metrics such as branch coverage by leveraging existing tools JQF [55], which takes Java bytecode as input and collects various guidance metrics for fuzz testing.
+3.2 Application Specific Coverage Guidance
+Priorworkfindsthatbranchcoverageisaneffectiveguidancemech- anism for feedback-guided fuzz testing, pushing test generation towards hard-to-reach corners [17, 44, 56]. Generally, feedback- guided fuzzing techniques instrument a program’s bytecode to label each constituent branch and if an input exercises a previously- unseen branch of the program, this input is appended in an input queue and the branch coverage is fed back into the fuzzer.
+However, we observe that such branch coverage guidance mech- anism cannot be applied to fuzz testing of big data analytics for two reasons. First, it cannot differentiate user-defined functions from framework code and can thus push test generation naively toward exploring the internals of DISC framework, as opposed to applica- tion logic. Second, it cannot effectively monitor different equiva- lence classes of dataflow operators though prior studies [38,45,52] argue that numerous errors originate from untested equivalence
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+Table 2: Data Collection for Error Type Study.
+and thus individual data records stop at this filter. BigFuzz in- struments “TraceLogger.get().emit(new FilterEvent(arm))” in specification implementation of filter to emit FilterEvent with a specific arm to the trace logger. In this way, BigFuzz retains the DISC framework’s behavior on the original application code, while abstracting its coverage guidance mechanism to the level of equivalence classes for individual dataflow operator uses. Coverage Guidance for User-Defined Function. DISC applica- tiondeveloperwritesapplicationlogicintermsofuser-definedfunc- tions (UDFs) and connects them using dataflow operators. These UDFs are standard library based Scala or Java implementations. To restrict normal coverage guidance to the body of UDFs (e.g., Figure2d),BigFuzz usesaselectiveinstrumentationschemeinASM, while ignoring all other dependent libraries. This combination of monitoring dataflow equivalence coverage together with control flow events in the body of UDFs constitutes the joint dataflow and user-defined function path coverage (JDU path coverage), which essentially represents the behavior of application logic.
+
Keyword
Total
Inspected
StackOverflow-Spark
apache spark exception
2430
top 150
apache spark error
3780
top 200
apache spark wrong/ unexpected/inconsistent result/output
143
143
StackOverflow-Hadoop
hadoop exceptions
2567
top 100
hadoop error
9585
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt b/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt
new file mode 100644
index 0000000..4b9e412
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt
@@ -0,0 +1,108 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Annals of Emerging Technologies in Computing (AETiC)
+Vol. 4, No. 3, 2020
+Research Article
+Failure Mode & Effect Analysis and another Methodology for Improving Data Veracity and Validity
+Ana Elsa Hinojosa Herrera*, Chris Walshaw and Chris Bailey
+School of Computing & Mathematical Sciences, University of Greenwich, UK
+aehinojosa@ieee.org; C.Walshaw@greenwich.ac.uk; C.Bailey@greenwich.ac.uk *Correspondence: aehinojosa@ieee.org
+Received: 29th April 2020; Accepted: 1st June 2020; Published: 1st July 2020
+Abstract: Failure Mode & Effect Analysis (FMEA) is a method that has been used to improve reliability of products, processes, designs, and software for different applications. In this paper we extend its usage for data veracity and validity improvement in the context of big data analysis and discuss its application in an electronics manufacturing test procedure which consists of a sequence of tests. Finally, we describe another methodology, developed as a result of the DVV-FMEA application which is aimed at improving the tests' repeatability and failure detection capabilities as well as monitoring their reliability.
+Keywords: Big Data; Data Veracity; Data Validity; FMEA; Statistics; Electronics Manufacturing; Quality Assurance; Test Limits Optimisation
+1. Introduction
+The market of data analytics was valued at USD 904.65 million in 2019 and is expected to reach USD 4.55 billion by 2025 [1]. Moreover, the use of data driven techniques is popular in smart manufacturing. Cost reduction can be achieved by mining data for predicting the quality of a batch, improving robustness of processes, or by reducing the process cycle time, for example.
+With regards the definition of big data, the authors in [2] describe it using 1C for complexity and 11Vs for: Volume, Velocity, Variety, Volatility, Virtual, Visibility, Vendee, Vase, Value, Veracity, and Validity. In this paper we cover the last 2 Vs of the list.
+Failure Mode and Effect Analysis (FMEA) is a method that has been used to improve reliability, testability and safety of hardware designs, processes, products, and software, for example [3-6]. In electronics, hardware (HW) FMEA has been used to improve electronics reliability [4], and in [7] software (SW) FMEA was used to validate embedded real time systems.
+In this paper we extend the usage of the FMEA method to improve data veracity and validity. The proposed extension (DVV-FMEA) is illustrated with an electronics manufacturing application for quality assurance. From using DVV-FMEA in this application a novel methodology was motivated for evaluating, improving and monitoring the definition of production tests.
+This article is organized as follows. Section 2 introduces the data veracity and validity concepts and main causes that commonly affect data quality. Section 3 discusses the usage of FMEA for data improvement and its application in production testing data. Sections 4 and 5 present the methodology for test definition evaluation, improvement, and monitoring, in addition to its application in a production test dataset, respectively. And finally, Section 6 concludes the article and states future work.
+Ana Elsa Hinojosa Herrera, Chris Walshaw and Chris Bailey, “Failure Mode & Effect Analysis and another Methodology for Improving Data Veracity and Validity”, Annals of Emerging Technologies in Computing (AETiC), Print ISSN: 2516-0281, Online ISSN: 2516-029X, pp. 9-16, Vol. 4, No. 3, 1st July 2020, Published by International Association of Educators and Researchers (IAER), DOI: 10.33166/AETiC.2020.03.002, Available: http://aetic.theiaer.org/archive/v4/v4n3/p2.html.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+AETiC 2020, Vol. 4, No. 3 15
+2. Data Veracity and Validity
+Improving poor data veracity and validity is relevant for big data applications, because low quality data could generate inaccurate models and unreliable information, resulting in incorrect data-driven decision-making. In this section we discuss the characteristics of data veracity and validity.
+2.1. Data Veracity
+Data veracity is the ability to understand the data and the analytical process applied to a dataset. It covers aspects related to confidence in the dataset or data source, for example data integrity, availability, completeness, consistency, and accuracy and in addition, transparency and clarity in the processes used to generate, improve and analyse the dataset [2, 8, 9]. Authors in [10] discuss a general list of causes that frequently affect data veracity:
+· Measurement system limits: For example, equipment calibration, human errors, and non- standard measurement processes.
+· Limits of features extraction: This could be evaluated by measuring the precision of correctness and completeness.
+· Data integration limits: In real applications it is useful to gather and combine information from different sources, but sometimes it is challenging due to the diversity of data sources or formats.
+· Data ambiguity and uncertainty: In addition to the uncertainty due to data integration there are other sources of data ambiguity, for example ambiguities of natural language, uncertainty related to the information source and low relevance of the information with respect to other available information [11].
+· Data falsification and source collusion: In [12] authors model data falsification attack as a constrained optimization problem with two parameters: efficacy and covertness of the attack. The first parameter is related to the degradation in the detection performance, and the second one is the probability that the attacker will not be detected. In the formulation, the attacker would maximize the attack efficacy while controlling its exposure to the defence mechanism.
+2.2. Data Validity
+Data validity refers to data worthiness, which may change over time and during the process under study. For example, data generated before relevant changes in the process is not valid to generate models of the current state [2].
+The authors in [13] discussed data staleness for information systems where data is frequently updated. This data freshness characteristic is relevant, for example, in data streaming applications where information quickly becomes obsolete.
+3. Data Veracity and Validity Failure Mode and Effect Analysis
+In Section 2 we discussed the importance of veracity and validity. In addition, we noted its impact on data-based decision-making success. In this section we are going to present the DVV- FMEA steps to follow for improving these two elements of the big data definition, and the results of its usage in an electronics manufacturing quality assurance application.
+3.1. Steps of DVV-FMEA
+The DVV-FMEA is like HW FMEA, although with differences in System Identification, List of Failure Mode, Causes Identification, and Effect Analysis steps. The details as follows:
+Step 1. System Identification: In data-driven analysis, it is common that the modules identified in the process before using datasets for analysis consist of data generation, data storage, data gathering, and data pre-processing. Nevertheless, in some applications where data is streaming the storage module could be different.
+As in SW FMEA, the variables or features in the dataset must be listed for its evaluation. When working on big datasets which comprise a big quantity of variables, it seems sensible to group them based on engineering feature or data processes similarities.
+Step 2. List of Failure Modes Generation: It makes sense to split the meeting time into the different modules and generate a failure modes list for each of these. The brain-storming meeting(s) should include team members with know-how and expertise in the data process and application.
+Step 3. Causes Identification: List the causes of failure modes and score them by its occurrence. We recommend including causes related to measurement system limits, features extraction limits, data integration limits, data ambiguity and uncertainty, data falsification and source collusion, data staleness. Ishikawa diagram is a useful tool which could be used as a guidance for causes identification. In Fig. 1 is the version we propose for causes identification in DVV-FMEA. It could be used for each failure mode identified in Step 2.
+
+Figure 1. Ishikawa Diagram for DVV Failure Modes Causes
+Step 4. Effect Analysis: In this step the effects of the failures are listed, and each of the effects is scored by its severity. It makes sense to include impacts to confidence in the dataset or data source, data integrity, data availability, data completeness, data consistency, data model, or analysis accuracy, execution time or efficiency, ability to replicate results or analysis, and data worthiness.
+As a guidance during the meeting, the DVV-FMEA leader could ask if and how each of the impacts listed above impacts the failure mode and fill it in the DVV-FMEA table.
+The following steps are the same as in HW FMEA.
+Step 5. Detection mechanism identification: A list with the available mechanisms that helps detecting the failure modes is generated. Each failure mode should have a score of its detectability.
+Step 6. Failure mode prioritization: In order to improve the efficiency of this method, the list of failure modes should be filtered based on the Risk Priority Number (RPN), which is calculated as in:
+Equation 1. Risk Priority Number
+$\mathrm{RPN} = \mathrm{Severity} \times \mathrm{Occurrence} \times \mathrm{Detection}$
+Step 7. Process or Product Improvement: Based on the prioritization and resources available, the next step is to generate and execute an improvement plan, which contains actions to improve the data veracity and validity. These changes should reduce the score of severity, occurrence, or detection. It seems likely that severity score is less frequently reduced.
+3.2. Severity, Occurrence, and Detection Scales
+For the scaling it makes sense to use simple scales for severity, occurrence, and detection scores. For example, a 5 levels measure such as the Likert scale, which is easy to use. In Table 1 is detailed the ranking scale we recommend. Whenever historical data or a previous DVV-FMEA is available, it could be used to quantify the severity, likelihood, or detectability rates.
+Table 1. Occurrence, Severity, and Detection Ranking Scale
+Ranking
Occurrence
Severity
Detection
1
No known failures
Very low or none
Almost certain detection
3
Isolated failures
Low or minor
Remote chance of detection
5
Occasional failures
Moderate or significant
Moderate chance of detection
7
High rate of failure
High
High chance of detection
10
Failure is almost inevitable
Very high or catastrophic
Cannot be detected
3.3. DVV-FMEA Application in Production Testing
+In this subsection we include DVV-FMEA usage to establish the pre-processing step of the data analysis of an electronics manufacturing application. Experts in the manufacturing and data processes were part of the team that generated the DVV-FMEA table.
+In this application the input variables are the result of individual tests in a sequence that runs in a stop-on-fail scenario. For some tests in the sequence, a feature is measured and then compared to upper, lower or both limits to classify faulty devices. More details of the application and intermediate steps of the DVV-FMEA can be found in [14].
+As a result of using the DVV-FMEA, and based on the RPN, the list of +60 failure modes related to data validity and veracity was reduced to 14. Some of them are included in Table 2. Most of the improvements comprise R scripts that pre-process data before its usage for analysis. The scripts detect incorrect data and eliminate it, correct formats, and standardize data pre-processing steps to ensure repeatability, consistency, efficiency, and confidence.
+Table 2. DVV-FMEA for an Electronic Manufacturing Application
+System Module
Input
Failure Mode
RPN
Data Generation
Overall result
The overall result is not consistent
490
Data Generation
Text File
The file format is not correct
100
Data Generation
Test: 90, 480
The test was unsuccessful to detect faulty devices
150
Data Generation
Test type
Different to test sequence ‘p’
50
Data Generation
Dataset
Data does not represent the current process conditions
250
Data Pre-processing
Data order
The data is not ordered by date-time
70
Data Pre-processing
Clean dataset
No clarity on how the data was processed before using it for analysis
49
Data Pre-processing
Test/Training datasets
The sampling is not repeatable
70
+The failure mode that has the highest priority is that the overall test result is not consistent, impacting the effectiveness of the test but also its efficiency because extra analysis is performed to ensure the good quality of the devices. The definition of the limits is relevant not only to the accuracy of the tests and the overall result, but also to its efficiency, because in the application one faulty characteristic of the device could be detected by more than one test in the sequence, but the earlier the fault is detected, the shorter the length of the test procedure. In Section 4 we present a methodology proposed to improve the definition of the tests. It was automated using a Python script implemented in a Jupyter notebook.
+Another failure mode with high priority is to avoid using out-of-date data for data analysis because the model would not be useful for the current state. This failure mode is relevant because in real applications it is very common that the processes change over time, for instance using new raw materials, updates to the design, or improvements to the manufacturing procedures. The methodology in Section 4 includes a monitoring phase which could be used for data analytics reliability as well.
+4. Test Limits Evaluation, Improvement and Monitoring Methodology
+The tests limits evaluation and improvement process we propose consists of four main phases: Test Efficiency Evaluation, Test Utility to Improve another Test Evaluation, Re-Define Test Limits, and Limits Monitoring.
+4.1. Phase 1: Test Efficiency Evaluation
+In this phase the aim is to evaluate each test in the sequence, comparing the data distribution versus test limits for FS-PTx, PS, and FTx samples.
+Step 1. Select a Test_x in the Sequence: The earlier in the sequence the better because potentially there is more improvement when finding a fail early in the sequence.
+Step 2. Split the Dataset into FS-PTx, PS, FTx: Here FS-PTx contains data of assets that failed the test sequence but in another test different to Test_x, PS contains the data of assets that passed the test sequence, and FTx is the data of assets that fail Test_x.
+Step 3. Plot Histograms for FS-PTx, PS, FTx: In the histograms can be visualised how each of these datasets performs versus the Test_x limits, if there is a partition between the three datasets, and if the datasets correspond to the same distribution.
+Step 4. Calculate Statistics for FS-PTx, PS, FTx: Descriptive statistics are useful for understanding the datasets. It makes sense to include mean, standard deviation, quartiles, maximum and minimum.
+Step 5. Partition Evaluation: Quantify the distance between PS and FTx populations. We propose using the following formulas:
+Equation 2. Partition Evaluation around Lower Limit
+$\max(\mathrm{FTx}_{\mathrm{below\ ll}}) + 2 \cdot \mathrm{std}(\mathrm{PS}_{0.15\text{–}0.85}) < \mathrm{Tx\ lower\ limit}$
+Equation 3. Partition Evaluation around Upper Limit
+$\min(\mathrm{FTx}_{\mathrm{above\ ul}}) - 2 \cdot \mathrm{std}(\mathrm{PS}_{0.15\text{–}0.85}) > \mathrm{Tx\ upper\ limit}$
+Where FTxbelow ll = {y in FTx | y < Tx lower limit}, FTxabove ul = {y in FTx | y > Tx upper limit}, and PSbetween 0.15 and 0.85 quartiles = {y in PS | y > PS quartile 15% & y < PS quartile 85%}.
+Step 6. Is there a Partition Between PS and FS-PTx? Using results of Steps 3 to 5 of this phase, when the answer is positive, the recommendation is to add or update the limits for Test_x.
+Step 7. Are PS & FTx Clearly Separated? Using results of Steps 3 to 5 of this phase, when the answer is negative, the recommendation is to reconsider the limits for Test_x.
+Step 8. Is FTx Empty? If the data of FS-PTx, PS, FTx are a representative sample, it can be inferred that it is highly probable that Test_x is passed, as a result could be eliminated from the sequence, or reduced the frequency of its execution.
+4.2. Phase 2: Test Utility to Improve another Test Evaluation
+In this phase the aim is to identify relationships between tests and whether one test could be used to calculate the result of another one. The steps are as follows:
+Step 1. Select Test_y in the sequence: Here Test_y is another test in the sequence which is executed after Test_x.
+Step 2. Are both continuous variables? If Test_x and Test_y measurements are continuous values, calculate Pearson Correlation Coefficient to quantify its association. If the coefficient is > 0.9 or < -0.9 the conclusion is that both tests are highly associated.
+Step 3. Are both discrete variables? If Test_x and Test_y measurements are discrete values, execute a Chi-Square Test to quantify their association. If the p-value is < 0.05 the conclusion is that both tests are highly associated. When the test sequence is run in a stop-on-fail scenario, this test cannot be performed, since the dataset contains “pass” and “fail” data for Test_y but only “pass” for Test_x.
+When associated Tests are found in Steps 2 and 3, sometimes the association between them could be used to estimate the value of Test_y instead of performing the reading. As a result, the test sequence potentially could be reduced.
+4.3. Phase 3: Re-Define a Test Limit
+In this phase, the results of previous phases are summarised and joined after solving possible conflicts, followed by the implementation and documentation of changes. The details as follows:
+Step 1. Improvements Summary: Summarise the recommendations from Phase 1 and 2.
+Step 2. Feasibility Evaluation: Evaluate if the new test limits are correct from customer and engineering point of view.
+Step 3. Conflict Evaluation: Also evaluate if the recommendations are not in conflict, otherwise evaluate which is the recommendation that generates more improvement.
+Step 4. Update Test Limits Definition: The automated test sequence should be updated with the new test limits definition. It is likely that this motivates a new software version, which may need to be certified as part of software quality processes.
+Step 5. Document Changes: We recommend that these changes and verifications to be documented on the DVV-FMEA to have all information related to data quality improvement in a single document.
+4.4. Phase 4: Limits Monitoring
+The objective of this phase is to continuously evaluate whether the new limits are valid, or a re- definition is needed.
+Step 1. Metrics Definition: It is relevant to select the most representative metrics to monitor, and it makes sense to choose only a few and to prefer the ones which are easy to measure.
+Step 2. Continuous Monitoring: We recommend using statistical process control charts to monitor the key metrics. To keep the manufacturing process as simple as possible, it makes sense to have a small list of key elements to monitor, and also to automate this step, and consider automated flags or warnings when the key elements are not in control.
+Step 3. Maintenance: Whenever any of the key monitored parameters are not in control it is time to revisit the earlier phases of this methodology.
+5. Test_80 Evaluation and Improvement
+In this subsection the methodology we proposed in previous section is illustrated using the Test_80, which is part of the test sequence analysed in the DVV-FMEA we included in Section 3.
+Figure 2 shows the histograms of assets that passed the test, and Figure 3 shows the histogram of assets that failed the test. In both figures, the upper and lower limits of Test_80 are indicated by vertical lines.
+
+Figure 2. Histograms of Assets that Passed Test_80 Figure 3. Histogram of Assets that Failed Test_80
+Table 3. Statistics of Test_80 Samples
+Statistics
PS
FS-PT80
FT80
Count
171131
39846
368
Mean
2.090
2.089
1.694
Std
0.006
0.010
0.432
Min
2.057
1.996
-0.140
25%
2.085
2.085
1.470
50%
2.088
2.089
1.473
75%
2.097
2.096
1.949
Max
2.104
2.104
2.697
From the histograms we can note that FS-PT80, PS and FT80 populations are not clearly separated. They are close around Test_80's lower limit. In addition, most of the assets, which failed Test_80, are near its lower limit. The statistics in Table 3 are in line with this conclusion. Furthermore, the results of the partition evaluation recommend re-defining the Test_80 lower limit.
+Following with the methodology, every test in the sequence was evaluated as stated in Phase 2. We found that there is a linear relation between Test_80 and Test_220. Furthermore, all are faulty assets when Test_80 < 2.05 & Test_220 > 2.05. Also, when Test_220 < 1.95 (Fig. 4).
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+www.aetic.theiaer.org
diff --git a/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt b/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt
new file mode 100644
index 0000000..920dedb
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt
@@ -0,0 +1,186 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Chapter 21
+Software Quality in the Era of Big Data, IoT and Smart Cities
+Fatmah Yousef Assiri and Rashid Mehmood
+21.1 Introduction
+Software quality is the degree to which the software conforms to its requirements. General software quality attributes include testability, maintainability, efficiency, and reliability. One important aspect of software quality is software correctness, which concerns how well the program provides the required functionalities, as defined by its specifications, and can be achieved through software testing and debugging. Software testing is a dynamic process that executes the software under study using a set of test inputs to ensure its outputs meet the users’ expectations. If the software behavior fails to perform as expected, software debugging is performed, which involves checking the code to determine the cause of failures and fixing them.
+Software testing and debugging are time-consuming. Studies show that software debugging and testing form between 50 and 70% of the total development cycle [41]. Software testing involves comparing a set of test inputs and expected results to the actual software outputs. If the software outputs fail to match the expected ones, a fault is detected and the software must be checked for errors. Code is debugged to locate faults and fix them. As requirements change, the software is tested again to ensure that it continues to return the expected behavior, and additional tests are written to test any new requirements; however, writing new tests is not a trivial process.
+F. Y. Assiri ( )
+College of Computer Science and Engineering, University of Jeddah, Jeddah, Saudi Arabia e-mail: fyassiri@uj.edu.sa
+R. Mehmood
+High Performance Computing Center, King Abdulaziz University, Jeddah, Saudi Arabia e-mail: RMehmood@kau.edu.sa
+© Springer Nature Switzerland AG 2020 519
+R. Mehmood et al. (eds.), Smart Infrastructure and Applications, EAI/Springer Innovations in Communication and Computing, https://doi.org/10.1007/978-3-030-13705-2_21
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+21 Software Quality in the Era of Big Data, IoT and Smart Cities 521
+The complexity of software is on the rise with the developments of smart cities. Smart cities are driven by, or involve, integration of multiple city systems, such as transport and healthcare, with the aim to provide its citizens a high quality of life [76], see, e.g., [72] for motivations of smart cities and societies. Integrating multiple complex systems causes an increase in the complexity of the underlying software interactions and leads to a higher software complexity. This in turn makes the software quality a bigger challenge.
+Relatedly, big data and Internet of Things (IoT) are driving radical changes in smart cities designs, and hence, the software systems landscape. Big data “refers to the emerging technologies that are designed to extract value from data having four Vs characteristics; volume, variety, velocity and veracity [71].” The Internet of Things (IoT) becomes one of the key technological developments of our times that we are able to realize its full potential; it is expected to be a major producer of big data [5]. IoT is defined as “a global infrastructure for the information society, enabling advanced services by interconnecting (physical and virtual) things based on existing and evolving interoperable information and communication technologies [81].”
+Together, big data, IoT, smart cities, and other emerging complex applications have exacerbated the challenges of maintaining software quality. The big data produced by IoT and other sources is used in designing or operating various software machines and systems. Since the data is uncertain (i.e., the veracity characteristic), it could lead to inaccurate or faulty system behavior. For example, a computed tomography (CT) scan based on inaccurate machine behavior, or inaccurate data, may give a false positive result for cancer. A wearable device may analyze the data of a diabetic patient incorrectly, giving false negative results, leading to no insulin dose for a patient who actually needed a high dose of insulin. Automatic surgery machines, autonomous vehicles, and spaceships all are examples of critical software with high software and data quality requirements. Moreover, data is being used by organizations to develop strategies, policies, and operations; inaccurate data could lead to disastrous outcomes for these organizations and even for the whole national or global economy.
+The aim of this paper is to review the technologies related to software quality in the era of big data, IoT, and smart cities. We elaborate on software quality processes, software testing and debugging. Model checking is discussed with some thoughts on the role it could play in the big data era and the benefits it could gain from big data. The role of big data in software quality is explored. Conclusion is drawn to suggest future directions.
+The remainder of the paper is structured as follows. Section 21.2 discusses software quality, software testing and debugging. Section 21.3 discusses model checking. Section 21.4 introduces big data and reviews some related work. Section 21.5 presents a review of the work that applies data mining techniques to utilize available data to improve software quality. Section 21.6 concludes the paper.
+21.2 Software Quality
+Software quality is the degree to which the software conforms to a set of requirements that meet the design specification and the users’ expectations. Quality can be viewed and evaluated from the aspects of function, structure, and process [26]. Functional quality concerns the conformance of the tasks to the users’ required functionalities, with few defects as possible. Structural quality relates to the quality of the written code and can be measured by code maintainability, testability, and understandability. Process quality relates to the development process such as meeting the delivery deadlines and budgets. These three aspects of software quality interleave and thus affect each other.
+Software testing and debugging are among the main activities in the development cycle that guarantee the quality of the developed software. Software testing is a validation process that is conducted to ensure that the software meets its specifications, and software debugging is the process of analyzing the code to locate errors that caused the software to fail and correcting them [41]. In Sects. 21.2.1 and 21.2.2, we explain the work that has been done in both areas.
+21.2.1 Software Testing
+Testing, which is among the main steps in the software development life cycle to ensure software quality, involves executing a set of input values and checking their outputs to validate that the software meets its requirements and intended usage[10]. Testing is a dynamic process performed by observing the software execution. If the resulting output differs from the expected results, a fault is detected. The process of finding these faults and correcting them is called debugging.
+Testing can be done at different levels depending on the phase that has been performed. Unit testing evaluates the software at the implementation phase and tests each unit separately. Units can be an individual element of the software such as a method or a class. System and integration testing are performed when the system is complete. System testing verifies that the whole system meets the design specifications, and integration testing checks that the subsystems (group of units) integrate correctly.
+Software testing is divided into black-box and white-box testing. Black-box testing examines the application functionalities without looking to internal structures. Black-box testing creates tests from the software requirements and specifications; one form of applying it is through the equivalence class partitioning in which the program behaves the same for each set of input values; each set is called a class. For example, the program should retain the same output values for all positive numbers, thus the set of positive numbers is considered a class, and the program should be tested with exactly one value of each class.
+White-box testing (also known as structural testing) is a method of testing software functionalities (internal structure), and it can be applied through unit and system testing. Tests performed by the software development team are called alpha testing, and those performed by the customer are called beta testing. Beta testing is also a form of black-box testing [79].
+Tests consist of a set of test cases. Each test case consists of input values and a test oracle, which compares the expected output with the actual output to determine whether a program has failed or not [20]. To overcome the problem of having no oracles or the time-consuming process of writing them [94], metamorphic testing was introduced [28, 97]. Metamorphic testing creates follow-up test cases from a set of initial test cases using metamorphic relations. For example, suppose the initial test evaluates the exponential function f(x) = 2^x with the input x = 3, so the output is 8. Metamorphic testing creates another test case with the input x = −3, whose output is 1/8. The metamorphic relation (MR) is used to check the outputs of the two tests. In this case, the MR is that the output of the first test case (8) multiplied by the output of the second test case (1/8) equals 1. If the MR is not satisfied, a failure is detected.
+Mutation testing is an alternative testing approach which was designed to assess the quality of the test cases [35, 46]. Mutation testing creates a copy of the original program, called a mutant, with a seeded fault. The faults are a simple syntax change injected to the code [61, 80]. Tests are executed and the fault is detected if the output of the mutant is different from the output of the original program. Mutation testing computes a mutation adequacy score, which represents the number of detected faults over the total number of seeded faults. A higher score indicates a higher quality of the test sets. MuJava tool was developed to perform automated mutation testing by generating mutants and computing the adequacy score for a set of JUnit tests [62].
+Software testing is labor intensive; thus, to reduce the costs, many automation techniques were developed to automate the generation of test data and test oracles [22, 23, 36, 55, 74, 90].
+21.2.2 Software Debugging
+Software debugging is a diagnosis process for locating and fixing errors that cause software to fail. Fault localization (FL) techniques were introduced to locate statements in source code that are more likely to contain faults. FL computes a suspiciousness score for each statement, and the computed score indicates the probability that a statement contains a fault.
+Spectrum-based FL (SBFL) [1, 4, 18, 29, 32, 49, 86], which is a common FL approach, is a dynamic process that counts the number of passed and failed tests executed for each statement and computes a suspiciousness score for each statement. Statements executed during a failed run are considered to be more likely to contain faults and are thus assigned a higher suspiciousness score than other statements.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+21 Software Quality in the Era of Big Data, IoT and Smart Cities 523
+Table 21.1 The dynamic behavior of the faulty program gcd when executed against tests in T1, ..., T5. Sus. Score is the suspiciousness score computed using Tarantula
+
+Stmt ID  Statement                          Sus. Score
+         gcd (int a, int b) {
+1        if (a < 0)  // fault               1.00
+2        { printf(“%g \n”, b);              0.00
+3        return 0 ; }                       0.00
+4        while (b ! = 0)                    0.50
+5        if (a > b)                         0.57
+6        a = a − b ;                        0.00
+7        else b = b − a ;                   0.57
+8        printf(“%g \n”, a) ;               0.57
+9        return 0 ;                         0.00
+10       }                                  0.00
+[NOTE: the per-statement execution marks (x) for tests T1–T5 were garbled during extraction and could not be reliably reconstructed; the statement-to-ID pairing above is inferred from the listed suspiciousness scores — verify against Table 21.1 in the original.]
+Many heuristics have been proposed to compute statement suspiciousness scores [1, 4, 48, 49, 77, 86].
+To illustrate how FL techniques order statements based on the likelihood they contain faults, we used the C program shown in Table 21.1 that is adapted from [47]. The program computes the Euclid’s greatest common divisor. This example used four passed tests: T1, T2, T3, and T4, and one failed test: T5. To compute the suspiciousness score, we applied the Tarantula heuristic (Eq. (21.1)). To reduce the time of performing this step, many tools have been developed to automate other parts of testing, such as the FL techniques [45, 47, 83].
+susp_Tarantula(s) = %FailedTests(s) / (%PassedTests(s) + %FailedTests(s))    (21.1)
+The debugging process also involves fixing located faults. Although this was traditionally a manual process, automated program repair (APR) techniques were developed to automate the process [52, 53, 59, 63, 78]. APR techniques take a faulty program and a set of repair tests to produce a repaired program. Figure 21.1 describes the overall structure of the APR techniques. The APR technique applies an FL technique to create a list of potentially faulty statements (LPFS) that is ordered based on their likelihood of containing a fault, creates a copy of the original program with one inserted change called a variant, and validates the created variant to check whether or not the fault is fixed.
+To create the variants, a set of program modification operators (PMOs) are applied to change the code in the faulty statement generating the variant. PMOs are selected randomly or in order based on the applied search algorithm. Then, each variant is validated by executing it on a set of test cases, regression tests, or formal specifications. The variant is considered a potential repair or potential repaired program if it passes all the tests used in the process. The generated repair
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+524
+Fig. 21.1 Overall automated program repair (APR) technique adapted from [15]
+F. Y. Assiri and R. Mehmood
+
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+is considered a potential repair, rather than a validated repair, because it is a repair with respect to the selected set of tests used in the process of fixing the faults. The repair is only considered a valid repair when it passes a set of tests (often regression tests) that were not included in the repair process.
+Many researchers have contributed to improve the APR process and the quality of generated repairs. Debroy and Wong [33, 34] proposed using mutations through a brute-force search and an FL technique to automate fault fixing. Nguyen et al. [78] developed SemFix, which is a tool that locates faults using the Tarantula heuristic [49]. Then, symbolic execution and program synthesis were used to fix faults. Program syntheses are applied in a predefined order. Wei et al. [91] fix faults using Eiffel programs equipped with contracts, and Kim et al. [53] repaired faults by creating fix templates using 10 built-in patterns that were developed based on common patches written by humans. Weimer et al. [92] developed a weighting scheme to locate faults and applied an evolutionary algorithm to fix faults. APR techniques are also used to fix faults for executable software [25, 82]. Evolutionary computing and genetic programming have been adapted to repair faults in C software [38, 59, 92, 93], Java [12, 52], and Python [2], and to help satisfy non-functional requirements [13, 95].
+The state-of-the-art APR technique is the GenProg tool, which uses genetic programming to modify a program until it finds a variant that passes all the repair tests [38, 59, 92, 93]. GenProg was used to successfully fix the Microsoft Zune date bug, which froze Microsoft devices in 2008 due to an infinite loop that occurred on the last day of a leap year [75]. However, repairs generated using GenProg were hard to read, and it only produced potential repairs since they failed when they were executed on a set of regression tests. Assiri and Bieman [15–17] proposed using first-order mutations with a stochastic search algorithm to generate repairs that are similar to efficient ones written by humans.
+Even though debugging activities (locating and fixing faults) have been automated to reduce debugging costs, there are many new challenges particularly with big data because it runs largely on parallel cloud computing platforms, making
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+21 Software Quality in the Era of Big Data, IoT and Smart Cities 535
+it error prone and inefficient. Researchers have developed debugging tools to overcome these problems.
+BigDebug is an interactive debugging tool that allows developers to set breakpoints to inspect program states during program execution [40]. BigDebug also provides guarded watchpoints, which return a set of records that satisfy a given condition. BigDebug, which provides backward and forward tracking and allows developers to fix faults and resume execution, improves the performance, avoids having to start the execution from the beginning, and reduces the number of locations that should be checked for failures.
+Considerable research has developed debugging tools for distributed systems. However, these typically depend on the use of a single frontend that controls many backend debuggers, which slows the process when used for large-scale distributed systems. Mehmood et al. [70] improved the structure of debuggers to scale them to large systems. The proposed debugging tool follows a hierarchical approach by using intermediate backend servers for a limited number of processes (Fig. 21.2), which evaluate assertions on the connected processes and report violations. This method improves the FL and system overall traffic, making it a suitable approach for large-scale distributed systems.
+An alternative method for debugging a distributed system is to perform the debugging at a higher abstraction level than the unit level [21]. When performed at the system level, system behavior is translated into a set of events that are filtered to remove all events that are not of interest to the user. Event sequences are then clustered to create one single event that is used to identify the cause of failures in complex distributed systems. Event definition language (EDL) is used to define a set of events based on a combination of previously determined events. Events are compiled and interpreted to determine the cause of the failures.
+Fig. 21.2 PDB architecture adapted from [70]
+Debugging tools rely on setting breakpoints or sets of slices to check the software’s behavior. Thus, if the specified locations of the variables do not contain the cause of the errors, the tools will be unable to identify the faulty code. Ko and Myers developed the Whyline tool [54], an interactive debugging tool that allows developers to ask questions for a given output. Whyline records execution traces for each event and each execution trace has a specific trace file. Then, an output history is created for all stored events. When a class is loaded, Whyline runs an algorithm that depends on data dependencies to identify all variables and fields affected by the output. After identifying the code responsible for the specified output, the tool generates questions using static and dynamic methods. Two questions are asked: “why did” and “why did not”. The first question is answered using the dynamic slicing technique and the latter is answered by investigating each instruction individually. The evaluation study found that using Whyline improved the debugging time for novice programmers, but it suffers from performance issues.
+21.3 Model Checking
+Model checking is a verification method that is performed to ensure program correctness by investigating all possible software internal states. Model checking requires a complete and clear set of properties that describes what the system should and should not do. The software states are checked against the specified properties. If a violation is found, counterexamples to the execution paths that caused the violation are generated. Model checking has been used to debug many systems such as airline reservation and e-commerce systems [19].
+Model checking has also been used to automate software testing (see Callahan et al. [24]). White-box testing, which concerns the software’s internal representation through the investigation of execution traces for intermediate values, detects errors if an inconsistency exists between the actual and expected values. Specification-based testing, which uses model checking techniques, was proposed to validate and generate tests during the software evolutionary process. In this method, a computation tree comprising all possible execution paths is generated and searched to ensure that all paths follow the specified constraints.
+Even though the work by Callahan et al. [24] used a model checker to generate test cases automatically, Amman et al. [9, 11] proposed using a model checker to generate mutation-adequate test cases by adapting mutation testing. Model checking is used widely to write and validate specifications. The proposed combination of model checking and mutation testing addresses the limitation of automatic test generation and mutation testing at the system level. System specifications are converted into a format used by the model checker using a modeling tool. Then, the generated specifications are mutated and used by the model checker to create counterexamples, which are used to automatically generate test cases. Tests are executed and the results and coverage are reported.
+For test generation, the SPIN model checker [44] is used to identify execution trace paths for a specified property. Paths are validated and divided into partitions based on a defined set of requirements; each partition, which is called a coverage property, consists of a set of execution paths. Test templates, comprising actual test sequences, are generated using SPIN and are used to create invalid coverage properties to force the program to fail.
+Formal methods, such as software cost reduction (SCR), have been used to improve software quality. SCR reduces the development cost since it helps to detect violations at an early stage in the software life cycle before the implementation [39]. SCR uses requirements to generate test sequences that consist of a set of input values and a set of output values for each input. The input values are validated by checking the set of constraints that are specified through the requirement specifications. Then, the test sequences are divided into equivalent partitions and test inputs are generated for all partitions.
+Model checking relies on building models of the actual systems and then verifying the models, and therefore, big data technologies can be used to automate the process of model building. Big data technologies could also improve the quality of models that are built before being model checked. Alternatively, model checking can be applied to address the veracity challenges of big data.
+While model checking has been very successful in verifying real-life systems, its biggest hurdle is the state-space explosion problem. Researchers have developed various techniques to address this challenge. These include, among others, the use of high performance computing techniques, see, e.g., [66, 67, 69].
+21.4 Big Data
+Big data is a relatively new research area that has been utilized in many fields such as online retail stores, decision-making, and scientific research [27]. Big data is defined variously in the literature: some researchers define it using the 3Vs: volume, velocity, and variety [56]. Volume relates to the size of the data, velocity is the speed of the data stream, and variety refers to the data types. Other researchers define big data using 4Vs, with the fourth V referring to value, variability, or virtual [98]. Fan and Bifet defined big data as the 3Vs plus two more: variability (data interpretation) and value (making decisions) [37]. We consider the definition where volume, variety, velocity, and veracity are used as the 4Vs of big data [71], and consider veracity, as many have noted, to be the biggest challenge of big data.
+Big data applications can be used in business, technology, health, and smart cities. Big data can be used to improve quality of life. Data have been used in online retail stores, such as Amazon, to identify user preferences. Algorithms collect information about the users’ preferences based on their actions [65]. In addition, the amount of healthcare data is increasing and is expected to reach a zettabyte in the near future in the USA [85]. Using this medical data will benefit individuals’ health by enabling doctors to detect diseases at the early stages and determine treatments, recovery options, and risks. For additional works on big data in context of smart cities, see [6, 7, 14, 68, 73, 88].
+21.5 Big Data and Software Quality
+Data can be used as a validity tool to ensure software correctness, build recommender systems, and predict future actions. Big data has been utilized in many sectors such as healthcare, banking, and transportation. Data are processed using data mining techniques to determine trends and to help in decision-making. Software quality can be related to big data in at least two ways. Firstly, big data can help develop better software quality techniques. Secondly, software quality techniques are needed to improve the quality of big data software and possibly deal with the big data veracity challenge.
+With respect to software quality, existing work has applied data mining techniques to analyze data repositories, fix faults, determine trends, and automate test generation.
+21.5.1 Mining Big Data
+Data mining is performed to analyze large amounts of data to understand trends in the data and support decision-making [42]. Software intelligence (SI) is a new field of mining software data to help practitioners in daily decision-making processes, such as when to release the system, what part of the system to test, and/or what part to change [43].
+Mining software repositories is a research direction that analyzes data repositories to obtain useful information about systems and projects. The types of repositories include historical repositories that show project progress; run-time repositories, which show system usage on deployment sites; and code repositories, which contain the code for software versions. Linking code repositories and bug repositories can provide a method for warning practitioners about bugs and risky code.
+Lin and Ryaboy analyzed Twitter data using data mining tools; however, due to the limitations of existing tools, the analysis was not a straightforward process [60]. In [89], the researchers mined heterogeneous information using the semantics of node types and the links between them in the networks. The researchers in [51] studied the potential of mining big graphs and found the PEGASUS tool to be a promising approach since it finds anomalies in the large connected Twitter graphs. Last, the authors in [8] focused on mining a large stream of Netflix Prize data to personalize recommendations. To improve the prediction of customers’ selections, many factors and more data need to be considered.
+The authors in [50] used mining bug reports to develop the BugMiner tool, which uses the support vector machines (SVM) machine learning technique to perform a completion check and a redundancy check on new reports and estimate bug report trends (e.g., incident rate over time) of bug report databases using natural language processing. SVM used the historic reports to train the model to fill any missing fields. For any given report, the tool checks if it already exists by applying similarity ranking using cosine similarity, and Weibull distribution uses historic data to estimate the number of bug reports received during a specified period (weeks or months) after the start of the project. The experimental results showed that BugMiner was effective in terms of bug report completion, redundancy, and finding trends. The authors suggest combining the tool with other bug tracking tools to create advanced intelligent software.
+Mining software is also used to develop a repair model in the area of APR [64]. In their paper, the authors mine software repositories by investigating developers’ comments to generate repair actions that can be used later to fix faults. Repair actions can be in the form of adding a method call or changing the condition of if statements. Repair actions are then assigned different probabilities that are also learned from the repositories. To collect fixes from repositories, the authors used a data set of 14 repositories and checked the differences between transitions at the abstract syntax tree (AST) level. A difference algorithm was used to produce the set of changes between each pair of Java files. The authors generated 41 change types and 137 possible change type entity types. The empirical study found that 28% of the changes were statement insertions, 23% were statement deletions, and 23% were statement updates. However, the change type statement insert was composed of many entity types, e.g., insert method invocation, if conditional, insert new variable. The results showed that the probability distribution of change type is project independent.
+To repair faults, the authors of [64] created a repair model and used different approaches to compute the probabilities of each repair action. The repair shape, which is a set of all possible combinations of repair actions, was then created. The search space is a combination of fault space, repair shapes, and the concrete repair actions that create the shape.
+In [96], the authors mined software repositories to study the co-evolution of the production code and test code. Repository histories and log messages were analyzed; however, the results found no matching between changes in the production code and the test. In other words, the test codes remained the same after changing the production code. The test coverage also dropped since no new test was created to guarantee the coverage of the new boundary values. Despite the notable finding, the study failed to specify which data mining techniques were used to check the repositories.
+Data mining algorithms are used to automatically induce missing functional requirements from data executions [58]. This approach can help to recover missing and incomplete specifications, design regression tests, and evaluate the correctness of software. Creating up-to-date regression tests is difficult, especially with legacy systems. One way to create regression tests is to identify the input–output relationships to write the requirements of the existing system. In [57], the authors proposed to identify the input–output relationships automatically using info-fuzzy networks (IFN), and they evaluated the effectiveness of IFN methodology on complex systems. The experimental results found that the data mining methods are effective for generating tests automatically without needing humans or complete sets of requirements since functional requirements are learned from data execution.
+This study compares two approaches of automated construction of oracle: artificial neural networks (ANNs) and IFNs [3]. ANNs have been used to generate a minimal set of tests that are effective at revealing faults [57, 87]. To generate oracles automatically, the following three steps are performed: (1) the training phase, where the system is given positive oracles; (2) the evaluation phase, which accepts positive oracles and rejects negative ones; and (3) the decision phase in which the trained oracles identify correct test cases from unlabeled ones. The experimental results found that IFN would be more appropriate for testing applications that are at the early stages. However, ANNs appear to be better at identifying hard-to-detect faults.
+Data mining techniques have been adapted to troubleshoot distributed systems [30]. The goal of this approach is to identify which resource properties would succeed or fail for specific jobs. To demonstrate this approach, the job and machine features for 1000 jobs were extracted, and the job status was described as either a success or failure. Then, two data mining techniques were applied to generate a prediction model: C4.5 decision tree [84] and RIPPER rule-based classification algorithm [31]. Even though both methods predicted that the same features would cause the failures, RIPPER was found to be a more robust and promising method. While other data mining techniques, such as the lazy learning technique, can be applied, they tend to require more information before drawing the model. Additional research is needed to examine more internal or external features.
+21.6 Summary, Conclusions, and Future Work
+Software quality is the degree to which the software conforms to its requirements. General software quality attributes include testability, maintainability, efficiency, and reliability. One important aspect of software quality is software correctness, which concerns how well the program provides the required functionalities, as defined by its specifications, and can be achieved through software testing and debugging. The complexity of software is on the rise with the developments of smart cities due to the complex nature of these applications and environments. Big data and Internet of Things (IoT) are driving radical changes in the software systems landscape. Together, big data, IoT, smart cities, and other emerging complex applications have exacerbated the challenges of maintaining software quality.
+The big data produced by IoT and other sources is used in designing or operating various software machines and systems. Since the data is uncertain (i.e., the veracity characteristic), it could lead to inaccurate or faulty system behavior. In this paper, we reviewed the technologies related to software quality in the era of big data, IoT, and smart cities. We elaborated on software quality processes, software testing and debugging. Model checking was discussed with some directions on the role it could play in the big data era and the benefits it could gain from big data. The role of big data in software quality was explored.
+We discussed that software quality can be related to big data in at least two ways. Firstly, big data can help develop better software quality techniques. Secondly, software quality techniques are needed to improve the quality of big data software and possibly deal with the big data veracity challenge. We also highlighted that big data technologies can be used to automate the process of model building as part of the model checking process. Big data technologies could also improve the quality of models that are built before being model checked. Alternatively, model checking can be applied to address the veracity challenges of big data. As mentioned that the biggest hurdle of model checking is the state-space explosion problem that could be addressed using high performance computing techniques.
+Our future work will focus on bringing together cutting-edge software quality and big data techniques to develop novel techniques for improving software and data quality of smart city systems.
+References
+1. Abreu, R., Zoeteweij, P., Van Gemund, A.J.: On the accuracy of spectrum-based fault local- ization. In: Testing: Academic and Industrial Conference Practice and Research Techniques- MUTATION, 2007. TAICPART-MUTATION 2007, pp. 89–98. IEEE, Piscataway (2007)
+2. Ackling, T., Alexander, B., Grunert, I.: Evolving patches for software repair. In: Proceedings of the 13th Annual Conference on Genetic and Evolutionary Computation, GECCO ’11, pp. 1427–1434. ACM, New York (2011)
+3. Agarwal, D.: A comparative study of artificial neural networks and info fuzzy networks on their use in software testing. Master’s Thesis, University of South Florida (2004)
+4. Agrawal, H., Horgan, J.R., London, S., Wong, W.E.: Fault localization using execution slices and dataflow tests. In: Proceedings of the Sixth International Symposium on Software Reliability Engineering, pp. 143–151. IEEE, Piscataway (1995)
+5. Alam, F., Mehmood, R., Katib, I., Albeshri, A.: Analysis of eight data mining algo- rithms for smarter internet of things (IOT). Procedia Comput. Sci. 98, 437–442 (2016). https://doi.org/10.1016/j.procs.2016.09.068. http://www.sciencedirect.com/science/article/pii/ S187705091632213X. The 7th International Conference on Emerging Ubiquitous Systems and Pervasive Networks (EUSPN 2016)/The 6th International Conference on Current and Future Trends of Information and Communication Technologies in Healthcare (ICTH-2016)/Affiliated Workshops
+6. Alomari, E., Mehmood, R.: Analysis of Tweets in Arabic Language for Detection of Road Traffic Conditions, pp. 98–110. Springer, Cham (2018). https://doi.org/10.1007/978-3-319- 94180-6_12. http://link.springer.com/10.1007/978-3-319-94180-6_12
+7. Alotaibi, S., Mehmood, R.: Big Data Enabled Healthcare Supply Chain Management: Oppor- tunities and Challenges, pp. 207–215. Springer, Cham (2018). https://doi.org/10.1007/978-3- 319-94180-6_21. http://link.springer.com/10.1007/978-3-319-94180-6_21
+8. Amatriain, X.: Mining large streams of user data for personalized recommendations. ACM SIGKDD Explor. Newsl. 14(2), 37–48 (2013)
+9. Ammann, P.: System testing via mutation analysis of model checking specifications. ACM SIGSOFT Softw. Eng. Notes 25(1), 33 (2000)
+10. Ammann, P., Offutt, J.: Introduction to software testing, Cambridge University Press, Cam- bridge (2016)
+11. Ammann, P.E., Black, P.E., Majurski, W.: Using model checking to generate tests from specifications. In: Proceedings of Second International Conference on Formal Engineering Methods, pp. 46–54. IEEE, Piscataway (1998)
+12. Arcuri, A.: On the automation of fixing software bugs. In: Companion of the 30th International Conference on Software Engineering, ICSE Companion ’08, pp. 1003–1006. ACM, New York (2008)
+13. Arcuri, A., Yao, X.: A novel co-evolutionary approach to automatic software bug fixing. In: IEEE Congress on Evolutionary Computation, 2008. CEC 2008. (IEEE World Congress on Computational Intelligence), pp. 162–168. IEEE, Piscataway (2008)
+14. Arfat, Y., Mehmood, R., Albeshri, A.: Parallel Shortest Path Graph Computations of United States Road Network Data on Apache Spark, pp. 323–336. Springer, Cham (2018). https:// doi.org/10.1007/978-3-319-94180-6_30. http://link.springer.com/10.1007/978-3-319-94180- 6_30
+15. Assiri, F.Y., Bieman, J.M.: An assessment of the quality of automated program operator repair. In: Proceedings of the 2014 ICST Conference, ICST’14, IEEE, Piscataway (2014)
+16. Assiri, F.Y., Bieman, J.M.: The impact of search algorithms in automated program repair. Submitted to the 2015 International Conference on Soft Computing and Software Engineering, (SeSe’15) (2015)
+17. Assiri, F.Y., Bieman, J.M.: Fault localization for automated program repair: effectiveness, performance, repair correctness. Softw. Qual. J. 25(1), 171–199 (2017)
+18. Baah, G.K., Podgurski, A., Harrold, M.J.: The probabilistic program dependence graph and its application to fault diagnosis. IEEE Trans. Softw. Eng. 36(4), 528–545 (2010)
+19. Baier, C., Katoen, J.P.: Principles of model checking. MIT Press, Cambridge (2008)
+20. Baresi, L., Young, M.: Test oracles. Tech. Rep., Technical Report CIS-TR-01-02, University of Oregon, Dept. of Computer and Information Science, Eugene, Oregon (2001)
+21. Bates, P.C., Wileden, J.C.: High-level debugging of distributed systems: the behavioral abstraction approach. J. Syst. Softw. 3(4), 255–264 (1983)
+22. Boyapati, C., Khurshid, S., Marinov, D.: Korat: automated testing based on java predicates. In: ACM SIGSOFT Software Engineering Notes, vol. 27, pp. 123–133. ACM, New York (2002)
+23. Burdonov, I., Kossatchev, A., Petrenko, A., Galter, D.: Kvest: automated generation of test suites from formal specifications. In: International Symposium on Formal Methods, pp. 608– 621. Springer, Berlin (1999)
+24. Callahan, J., Schneider, F., Easterbrook, S., et al.: Automated software testing using model- checking. In: Proceedings 1996 SPIN workshop, vol. 353 (1996)
+25. Carzaniga, A., Gorla, A., Mattavelli, A., Perino, N., Pezze, M.: Automatic recovery from run- time failures. In: Proceedings of the 2013 International Conference on Software Engineering, pp. 782–791. IEEE, Piscataway (2013)
+26. Chappell, D.: The three aspects of software quality: functional, structural, and process, White Paper. Chappell & Associates, San Francisco, CA. Available at www.davidchappell.com. Last accessed 30 May 2019
+27. Chen, C.P., Zhang, C.Y.: Data-intensive applications, challenges, techniques and technologies: a survey on big data. Inf. Sci. 275, 314–347 (2014)
+28. Chen, T.Y., Cheung, S.C., Yiu, S.M.: Metamorphic testing: a new approach for generating next test cases. Tech. Rep., Technical Report HKUST-CS98-01, Department of Computer Science, Hong Kong University of Science and Technology, Hong Kong (1998)
+29. Chilimbi, T.M., Liblit, B., Mehra, K., Nori, A.V., Vaswani, K.: Holmes: effective statistical debugging via efficient path profiling. In: IEEE 31st International Conference on Software Engineering, 2009. ICSE 2009, pp. 34–44. IEEE, Piscataway (2009)
+30. Cieslak, D.A., Thain, D., Chawla, N.V.: Short paper: troubleshooting distributed systems via data mining. In: 15th IEEE International Symposium on High Performance Distributed Computing, pp. 309–312. IEEE, Piscataway (2006)
+31. Cohen, W.W.: Fast effective rule induction. In: Machine Learning Proceedings 1995, pp. 115– 123. Elsevier, Amsterdam (1995)
+32. Dallmeier, V., Lindig, C., Zeller, A.: Lightweight defect localization for Java. In: ECOOP 2005- Object-Oriented Programming, pp. 528–550. Springer, Berlin (2005)
+33. Debroy, V., Wong, W.E.: Using mutation to automatically suggest fixes for faulty programs. In: Third International Conference on Software Testing, Verification and Validation (ICST), pp. 65–74. IEEE, Piscataway (2010)
+34. Debroy, V., Wong, W.E.: Combining mutation and fault localization for automated program debugging. J. Syst. Softw. 90, 45–60 (2014)
+35. DeMillo, R.A., Lipton, R.J., Sayward, F.G.: Hints on test data selection: help for the practicing programmer. Computer 11(4), 34–41 (1978)
+36. Dick, J., Faivre, A.: Automating the generation and sequencing of test cases from model-based specifications. In: International Symposium of Formal Methods Europe, pp. 268–284. Springer, Berlin (1993)
+37. Fan, W., Bifet, A.: Mining big data: current status, and forecast to the future. ACM SIGKDD Explor. Newsl. 14(2), 1–5 (2013)
+38. Forrest, S., Nguyen, T., Weimer, W., Le Goues, C.: A genetic programming approach to automated software repair. In: Proceedings of the 11th Annual conference on Genetic and evolutionary computation, GECCO ’09, pp. 947–954. ACM, New York (2009)
+39. Gargantini, A., Heitmeyer, C.: Using model checking to generate tests from requirements specifications. In: ACM SIGSOFT Software Engineering Notes, vol. 24, pp. 146–162. Springer, Berlin (1999)
+40. Gulzar, M.A., Interlandi, M., Yoo, S., Tetali, S.D., Condie, T., Millstein, T., Kim, M.: Bigdebug: debugging primitives for interactive big data processing in spark. In: Proceedings of the 38th International Conference on Software Engineering, pp. 784–795. ACM, New York (2016)
+41. Hailpern, B., Santhanam, P.: Software debugging, testing, and verification. IBM Syst. J. 41(1), 4–12 (2002)
+42. Hand, D.J.: Principles of data mining. Drug Saf. 30(7), 621–622 (2007)
+43. Hassan, A.E., Xie, T.: Software intelligence: the future of mining software engineering data. In: Proceedings of the FSE/SDP Workshop on Future of Software Engineering Research, pp. 161– 166. ACM, New York (2010)
+44. Holzmann, G.J.: Design and Verification of Computer Protocols, Prentice Hall, Upper Saddle River (1991)
+45. Janssen, T., Abreu, R., van Gemund, A.J.: Zoltar: A toolset for automatic fault localization. In: Proceedings of the 2009 IEEE/ACM International Conference on Automated Software Engineering, pp. 662–664. IEEE Computer Society, Washington, D.C. (2009)
+46. Jia, Y., Harman, M.: An analysis and survey of the development of mutation testing. IEEE Trans. Softw. Eng. 37(5), 649–678 (2011)
+47. Jones, J.A., Harrold, M.J.: Empirical evaluation of the Tarantula automatic fault-localization technique. In: Proceedings of the 20th IEEE/ACM international Conference on Automated Software Engineering, pp. 273–282. ACM, New York (2005)
+48. Jones, J.A., Harrold, M.J., Stasko, J.T.: Visualization for fault localization. In: Proceedings of ICSE 2001 Workshop on Software Visualization, Toronto, Ontario, pp. 71–75. Citeseer (2001)
+49. Jones, J.A., Harrold, M.J., Stasko, J.: Visualization of test information to assist fault localization. In: Proceedings of the 24th International Conference on Software Engineering, pp. 467–477. ACM, New York (2002)
+50. Kaiser, L.W.B.X.G., Passonneau, R.: Bugminer: Software reliability analysis via data mining of bug reports. Delta 12(10), 09–0500 (2011)
+51. Kang, U., Faloutsos, C.: Big graph mining: algorithms and discoveries. ACM SIGKDD Explor. Newsl. 14(2), 29–36 (2013)
+52. Kern, C., Esparza, J.: Automatic error correction of Java programs. In: Proceedings of the 15th International Conference on Formal Methods for Industrial Critical Systems, FMICS’10, pp. 67–81. Springer, Berlin (2010)
+53. Kim, D., Nam, J., Song, J., Kim, S.: Automatic patch generation learned from human-written patches. In: Proceedings of the 2013 International Conference on Software Engineering, pp. 802–811. IEEE, Piscataway (2013)
+54. Ko, A.J., Myers, B.A.: Debugging reinvented: asking and answering why and why not questions about program behavior. In: Proceedings of the 30th International Conference on Software Engineering, pp. 301–310. ACM, New York (2008)
+55. Lamancha, B.P., Polo, M., Caivano, D., Piattini, M., Visaggio, G.: Automated generation of test oracles using a model-driven approach. Inf. Softw. Technol. 55(2), 301–319 (2013)
+56. Laney, D.: 3d data management: controlling data volume, velocity and variety. META Group Res. Note 6(70), 1 (2001)
+57. Last, M., Kandel, A.: Automated test reduction using an info-fuzzy network. In: Software Engineering with Computational Intelligence, pp. 235–258. Springer, Boston (2003)
+58. Last, M., Friedman, M., Kandel, A.: The data mining approach to automated software testing. In: Proceedings of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 388–396. ACM, New York (2003)
+59. Le Goues, C., Nguyen, T., Forrest, S., Weimer, W.: GenProg: a generic method for automatic software repair. IEEE Trans. Softw. Eng. 38(1), 54–72 (2012)
+60. Lin, J., Ryaboy, D.: Scaling big data mining infrastructure: the twitter experience. ACM SIGKDD Explor. Newsl. 14(2), 6–19 (2013)
+61. Ma, Y.S., Kwon, Y.R., Offutt, J.: Inter-class mutation operators for java. In: Proceedings of 13th International Symposium on Software Reliability Engineering, 2002. ISSRE 2003, pp. 352– 363. IEEE, Piscataway (2002)
+62. Ma, Y.S., Offutt, J., Kwon, Y.R.: Mujava: a mutation system for Java. In: Proceedings of the 28th International Conference on Software Engineering, pp. 827–830. ACM, New York (2006)
+63. Martinez, M., Monperrus, M.: Astor: evolutionary automatic software repair for Java. arXiv preprint arXiv:1410.6651 (2014)
+64. Martinez, M., Monperrus, M.: Mining software repair models for reasoning on the search space of automated program fixing. Empir. Softw. Eng. 20(1), 176–205 (2015)
+65. McAfee, A., Brynjolfsson, E., Davenport, T.H., Patil, D., Barton, D.: Big data: the management revolution. Harv. Bus. Rev. 90(10), 60–68 (2012)
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt b/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt
new file mode 100644
index 0000000..9ce75b9
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt
@@ -0,0 +1,115 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+SAT-ETL-Integrator: an extract- transform-load software for satellite big data ingestion
+Badr-Eddine Boudriki Semlali Chaker El Amrani Guadalupe Ortiz
+Badr-Eddine Boudriki Semlali, Chaker El Amrani, Guadalupe Ortiz, SAT-ETL-Integrator: an extract-transform-load software for satellite big data ingestion, J. Appl. Remote Sens.14(1), 018501 (2020), doi: 10.1117/1.JRS.14.018501
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+SAT-ETL-Integrator: an extract-transform-load
+software for satellite big data ingestion
+Badr-Eddine Boudriki Semlali,a,* Chaker El Amrani,a and
+Guadalupe Ortizb
+aAbdelmalek Essaâdi University, LIST Laboratory, Faculty of Sciences and Techniques,
+Tangier, Morocco
+bUniversity of Cádiz, UCASE Research Group, Escuela Superior de Ingeniería, Cádiz, Spain
+Abstract. Satellite data are used in several environmental applications, particularly in air quality supervising, climate change monitoring, and natural disaster predictions. However, remote sensing (RS) data occur in huge volume, in near-real time, and are stored inside complex structures. We aim to prove that satellite data are big data (BD). Accordingly, we propose a software as an extract-transform-load tool for satellite data preprocessing. We focused on the ingestion layer that will enable an efficient RSBD integration. As a result, the developed software layer receives data continuously and removes ~86% of the unused files. This layer also eliminates nearly 20% of erroneous datasets. Thanks to the proposed approach, we successfully reduced storage space consumption, enhanced the RS data accuracy, and integrated preprocessed datasets into a Hadoop distributed file system. © 2020 Society of Photo-Optical Instrumentation Engineers (SPIE) [DOI: 10.1117/1.JRS.14.018501]
+Keywords: remote sensing big data; ingestion layer; extract transform load software; data integration.
+Paper 190597 received Sep. 5, 2019; accepted for publication Jan. 7, 2020; published online Jan. 25, 2020.
+1 Introduction
+Recently, the world has witnessed a great rise in industrial, agricultural, and transport activities. This development certainly helps to improve the economic and the social status of countries. But it also causes many environmental issues that affect the quality of human health and the safety of our planet, such as the appearance of the ozone hole, the increase in climate changes, and the degradation of air quality (AQ) by the emission of many anthropogenic pollutants, such as carbon monoxide (CO), carbon dioxide (CO2), nitrogenous oxides (NOx), and methane (CH4).1 Thus remote sensing (RS) techniques are one of the proposed solutions enabling a
+near-real-time (NRT) tracking of the pollutant plumes emitted from the industrial and agricultural areas,2 ozone precursor estimation, aerosol optical depth (AOD) monitoring, and climate
+change monitoring. In addition, they provide a potential input data for AQ models.
+Generally, RS technique refers to the use of satellite data to measure ocean, Earth, and atmospheric components without making physical contact with them through the electromagnetic energy (EME).3 At present, there are more than 3000 satellites in orbit4 used for many purposes, such as military, Earth observation, weather, and forecasting support. All of these satellites are equipped with many active and/or passive sensors within different temporal, spatial, and spectral resolutions ranging from low to very high.5
+Basically, satellite sensors measure data, then the satellite processing unit corrects the erroneous data using specific algorithms including SPECAN and Doppler.6 Afterward, data are
+transmitted into ground stations through downlink channels to be distributed into a broadcast or a multicast.
+In this study, we collect data from the European Organization for the Exploitation of Meteorological Satellites (EUMETSAT) via the Mediterranean Dialogue Earth Observatory (MDEO) ground station installed at Abdelmalek Essa di University of Tangier in Morocco.7
+*Address all correspondence to Badr-Eddine Boudriki Semlali, E-mail: badreddine.boudrikisemlali@uae.ac.ma 1931-3195/2020/$28.00 © 2020 SPIE
+We also acquired RS data from the Earth Observation System Data and Information System (EOSDIS) of the National Aeronautics and Space Administration (NASA), the Infusing Satellite Data into Environmental Applications (NESDIS) of the National Oceanic and Atmospheric Administration (NOAA), and The Copernicus Open Access Hub (previously known as Sentinels Scientific Data Hub) built and operated by the European Space Agency (ESA), provided complete, free, and open access to Sentinel-1, Sentinel-2, Sentinel-3, and Sentinel-5P user products, starting from the in-orbit commissioning review. The acquired RS data comes from many polar and geostationary satellites and various sensors.
+These data are stored in specific complex scientific file extensions: the binary universal form for the representation (BUFR) of meteorological data, the network common data form (NetCDF), and the hierarchical data format (HDF5). The daily volume of the received RS data reaches 40 gigabits (GB) and exceeds 15 terabits (TB) per year. Furthermore, the speed with which data are received is very fast, at a rate of 30,000 files per day. Accordingly, and according to attribute definition (venue, volume, variety, veracity, velocity, and so on), the data may be classified as big data (BD).8 Based on these aforementioned brief statistics, we are going to confirm that satellite data are BD.
+Consequently, remote sensing big data (RSBD) turns out to be an extremely challenging problem to be dealt with, including an efficient, rapid, and NRT processing. In addition, RSBD for environmental observation is regarded as a data intensiveprocess because thevolume, complexity, and the velocity exceed the usual processing systems and architectures.9
+For this reason, we have adopted the Hadoop BD architecture to split the problems of RSBD. The proposed design includes six interactives layers, which are the data sources, the ingestion layer, the Hadoop storage, monitoring layer, and the visualization layer. In this paper, we will focus only on the ingestion layer. This phase is very critical because it is responsible to collect unprocessed RS data, to manage enormous volume of input data, to extract, to filter, and to integrate refined RS data into a Hadoop Distributed File System (HDFS).
+As a result, the developed extract transform load (ETL) tool has efficiently processed and extracted potential values with high accuracy and with a low storage volume in a moderate execution time. Furthermore, the developed software has performed all steps automatically and processes global RS data.
+The remainder of this paper is organized as follows: Secs2, 3, and 4 enumerate, respectively, the issues, the main focus of this paper, and a review of some related works, Sec.5 presents the different aspects and characteristics of RSBD, Sec.6 goes into the details concerning the challenges of RSBD and explains the architecture developed for the ingestion layer, Sec.7 provides the results and discusses the experimental analysis.
+2 Issues
+RS data are widely used for several environmental applications, particularly in air pollution and climate change monitoring. However, the exploitation of these data contains many challenges, which are as follows:
+The specifications of RS data, including the venue, the volume, and the velocity are complex in terms of processing.
+Satellite data should be processed in NRT to keep their freshness.
+Satellite data sometimes contain errors, gaps, and invalid datasets. It is recommended to remove them before the storage step.
+The existing architectures and solutions have some limitations and drawbacks in RS data ingestion.
+3 Main Focus of This Paper This study has the following aims.
+Understanding the nature and the characteristics of the used satellite data and proofing that we are working with RSBD.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-2 Jan Mar 2020 Vol. 14(1)
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+Developing a software as an ingestion layer for RS data integration, regarded as similar to an ETL tool as known from data warehousing.
+Storing the refined RS datasets into an HDFS.
+4 Background and Related Works
+The general architecture of satellite data processing consists of three logical groups of servers: receiving servers, preliminary processing and thematic processing servers, and data storage servers accommodating large daily volume of data. There are some examples of the satellite data receiving platforms as follows:
+The Office of Satellite and Product Operation of NOAA.
+The EUMETCast service of EUMETSAT.
+The ground segment system developed by ESA within the European Remote Sensing program.
+The receiving servers collect the data in NRT from satellite without any modules of process- ing. For instance, there are as follows:
+The Fairbanks (POES) and the Wallops (GOES) grounds station of NOAA.
+The Command and Data Acquisition (Polar system) and the Primary Ground Station (Geostationary system) of EUMETSAT.
+The preliminary processing performs radiometric calibration of the received data using spe- cific software such as SPECAN and Doppler. This stage of processing provides data of level 1. We can site some of the existing satellites processing center in the world as follows:
+The Satellite Operation Control Center of NOAA.
+The Environmental Satellite Processing Center of NOAA.
+The Earth Observing System and Operation System of NASA. The Science Data Processing Segment of NASA.
+The Central Facility (CF) of EUMETSAT.
+The Data Processing Ground Segment of ESA.
+Second, the processing server provides refined products, particularly atmospheric chem- istry, atmospheric temperature, humidity, fire, smoke, and so on to the customers through a website interface. These platforms offer to the end users easy online searching, exploring, and filtering based on keyword, satellites, instruments, organizations, projects, processing level, and temporal and/or spatial delimiters. Moreover, they visualize datasets into interactive maps in NRT and make data available for downloading via file transfer protocol (FTP) or hypertext transfer protocol (HTTP) servers. The primary goal of these platforms is to maximize the scientific return for mission, research, and decision makers. All these services are free and open to all users for any scientific purpose. The following list includes some of the pioneer platforms.
+The Earth Science Data Systems Program of NASA.10
+The Comprehensive Large Array-data Stewardship System of NOAA.11 The Copernicus Open Access Hub operated by ESA.12
+The Product Navigator of EUMETSAT.13
+The finalstep of processing consists of storing the processed satellite data into data centers as data storage system group. There are four big satellite data centers in the world, which are:
+the EOSDIS of NASA,
+the NESDIS of NOAA,
+the EUMETSAT Data Center,
+the European Space Astronomy Centre Science Data Centre.
+Currently, RS data are widely used in many scientific disciplines such as environmental and social sciences. This has led to an increase of RS data that will continue to scale exponentially. Thus the processing of the RS data includes many challenges, beginning from the acquisition
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-3 Jan Mar 2020 Vol. 14(1)
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+to the visualization step,14 as follows: (1) satellite data are measured in NRT from satellite sensors, then transmitted to ground datacenters through downlinks, so the big protest is how to download these data from their sources within a high speed to keep their freshness. (2) Such data should be preprocessed inside an ingestion layer to be integrated into scalable servers with big storage capacity. (3) The treatment of RS data requires permanent and functional clusters; accordingly, this consumes more energy, so the electrical power should also be economized. (4) It is very possible to find many duplicated datasets, so the elimination of redundancy will help to hold only potential values. (5) In addition, satellite data are pervasive; they generate a huge volume of data with high velocity that storage system cannot continuously host, so it is necessary to remove old RS data by creating a model that decides which data to keep and which to discard. (6) Satellite data include many noisy and erroneous datasets due to the uncer- tainty of sensors. Accordingly, developing an efficient data-refining software will be beneficial for enhancing the satellite data accuracy. (7) RSBD processing demands some knowledge in probability and statistics in order to employ deep learning (DL), machine learning, and neural network algorithms to unlock new insights.
+Despite the existing aforementioned strong architectures, platforms, and systems from big organizations such as the NASA, NOAA, EUMETSAT, and the ESA, we can find some lim- itations and challenges of processing. In addition, sometimes their technologies are exceeded by the complexity and the huge volume of the acquired RS data.9
+RS data processing is becoming a significant field of research. Many investigations have been made on different architectures. These research studies aim principally as follows:
+To optimize algorithms and processing patterns, JIN Hailiang combined the index and the Hibert curve to establish the index for the image data. Then the method of MapReduce parallel processing was used to write and query RS images. The experimental results showed that the method can effectively improve the data writing and query speed and has good scalability.15
+To include parallel computing techniques,16 to store and process RSBD within a distributed Hadoop platform,17 and to manage RSBD with the streaming processing tools.18
+To propose a combination of streaming and MapReduce for analysis of time series data, they tested their proposal by applying the break detection algorithm BFAST to MODIS imagery. Then they evaluated the computing performance and requirements quality attrib- utes. Their results revealed that the combination of Hadoop and R can handle complex analysis of RS time series.
+To come up with an empirical model of DI index to estimate RS applications.9 Muhammad Mazhar designed a real-time BD analytical architecture for RS satellites applications (Rathore et al., 2015).
+Winda Astriani performed an ETL model to create multidimensional data cube. The ETL application of using Geokettle expected to provide data warehouse developers with per- forming automatic preprocessing data that allows regulating the insertion of new data and updating data without generating a lot of queries.19
+RS data are regarded as BD according to the attribute definition based to the eight salients (venue, volume, velocity, value, veracity, vocabulary, validity, and variety). So that adopting a BD analytics architecture is very crucial to make the processing efficient, to gain insights, and to make better decisions.
+Our study focuses mainly on air pollution and climate change monitoring requiring tremen- dous RS data coming in NRT from many satellites and sensors within different temporal and spatial resolutions (SPRs). The nature of these data is complex and their volume is huge.6 Thus building a BD architecture for RS data will help absolutely in data acquisition, filtering, storage, processing, and visualization.
+This paper introduces an ingestion layer as a software system consisting of different com- ponents which fill the gaps between external data sources and the HDFS. This software can be regardedas anETL for raster satellitedata, which allows efficienthandlingof acquired data from several sources and integrating them in an optimized way into an HDFS and separates storage issues from algorithm and application issues.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-4 Jan Mar 2020 Vol. 14(1)
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+5 Remote Sensing Big Data: Aspects and Specification
+This section describes the characteristics of the satellite data used in terms of volume, velocity, variety, and so on to demonstrate that RS data are BD.
+5.1 Satellite Big Data: Aspects and Features
+Generally, RS techniques are defined as the technologies measuring the surface, ocean, and atmospheric components without making a physical contact with it through EME20; satellites
+are regarded as the key instrument of this technique.
+A satellite can be defined as an artificial machine placed into a specific orbit; this orbit can be polar passing by Sun-synchronous orbits (SSO), which combines altitude and inclination in such a way that the satellite passes over any given point of the planet's surface at the same local solar time. Geostationary orbit is placed with an altitude of ~36,000 km directly over the equator and revolves in the same direction that Earth rotates (west to east). At this altitude, one orbit takes 24 h.21 We can cite three types of orbital altitude, which are the low earth orbit (LEO), the medium earth orbit, and the high earth orbit.22
+Satellites are equipped with passive sensors such as LIDAR, RADAR, scatter meter, sounder, and laser altimeter detecting sunlight radiation reflected from the earth and thermal radiation in the visible and infrared of the electromagnetic spectrum. In addition, they do not emit their own radiation but receive natural light and thermal radiation from the Earth's surface.
+The second type is the active sensors (e.g., radar and laser scanners) emitting an artificial radiation to monitor the earth surface or atmospheric features. Moreover, they do not depend on daylight and are minimally affected by clouds, dust, fog, wind, and bad weather conditions.5
+Furthermore, satellite sensors have other specifications, particularly the SPR, which means the Earth is surface-scanned by the instrument, ranging from low to very high.
+In addition, satellite sensors have a specific frequency to across the same geolocation, called the temporal resolution (TMR), which varies as high, medium, and low TMR.
+Satellite sensors continuously measure environmental variables and parameters. Afterward, the satellite processing unit corrects the enormous measured data using some algorithms includ- ing Doppler or SPECAN. This correction concerns the SPR and the geo-localization errors.6 Datawillbetransmittedintoantennasinground stationsthroughdownlink channels.Theground stations process RS data in order to remove imperfections, ensure geometric corrections, and apply data calibrations. This step will generate RS data of level 2 (L2) and level 3 (L3) of processing.
+In our research, we aim to apply RS techniques to track pollutant plumes emitted from industrial and agricultural activities, detect wildfires, monitor climate changes, and supply Moroccan forecasting agencies in NRT in order to prevent damages and help decision makers. In this investigation, we collect data from the EUMETSAT via the MDEO ground station installed at Abdelmalek Essaâdi University of Tangier in Morocco.23 We also acquired RS data from the EOSDIS of NOAA, the NESDIS of NOAA, and the Copernicus platform.24
+From the statistical data in Table1 and according to Fig. 1, we can determine that there are manysourcesprovidingRSdatafrom varioussatellites(venue),wherein all ofthesesatellites are for environmental monitoring and meteorological application. These satellites are polar passing by an SSO excepting the geostationary Meteosat second generation (MSG).25 The majority of these satellites were launched in this last decade; for instance, the MetOp B in 2012,26 the Suomi National Polar-orbiting Partnership (NPP) in 2011, Sentinel-3A in 2016, and the Sentinel-5P in 2017.27 The MetOp C will be launched by the 2019. Their TMR is high, making 16 orbits daily within an average of 1 h of latency.28
+In our case study, the acquired RS data are stored in different scientific file formats, including the BUFR, Binary, NetCDF, and the HDF5 (variety). These files have some special structure and models to store datasets (vocabulary). Furthermore, these channels afford an enormous file in NRT. We notice that the daily rate of MDEO is about 20,000 files, the NESDIS reaches 8000 files, the EOSDIS stretch 7000 files, and the Copernicus produces an average of 200 files (velocity). The total amount of collected volume by the four sources sums up to about 37 GB per day and exceeds 14 TB per year (volume). In addition, satellite data have become very useful in many
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-5 Jan Mar 2020 Vol. 14(1)
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+Table 1 Sources channel and characteristics of the used satellite data in the case study.
+
+Organization
Satellite (sensors)
Product name
Latency (min)
File is format
(Files/ day)
Data amount (MB/day)
Copernicus
Sentinel 3 (OLCI)
Sentinel-3
15
NetCDF
41
14,000
Copernicus
Sentinel5P (TROPOMI)
Sentinel-5P
15
NetCDF 8
5
4400
MDEO
MetOp (IASI, AMSU)
EPS-Africa
30
BUFR, Bin
9000
2200
MDEO
MetOp (ATVOS)
EPS-Global
30
Bin
1000
180
MDEO
MSG (SEVIRI)
Data_Channel_3
30
GRIB,HDF5
300
240
MDEO
NPP (OMPS, VIIRS)
NPP-3
30
NetCDF,Bin
1000
1100
MDEO
MetOp (GOME-2)
SAF-Africa
30
BUFR, HDF5
2000
700
MDEO
MetOp (ASCAT, GOME-2)
SAF-Europe
30
BUFR, Bin,
+HDF5
5000
3800
NASA
AQUA (AIRS)
AIRS2SUP_NRT.006
15
HDF5
640
5400
NASA
AQUA (AMSU)
MCDAODHD
360
HDF5
4
4
NASA
AURA (MLS)
ML2CO_NRT.004
15
HDF5
90
25
NASA
AURA (MLS)
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-6 Jan Mar 2020 Vol. 14(1)
diff --git a/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt b/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt
new file mode 100644
index 0000000..52041a6
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt
@@ -0,0 +1,107 @@
+www.nature.com/scientificreports/ www.nature.com/scientificreports
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+www.nature.com/scientificreports
+
+
+
+
+open Automated data cleaning of
+paediatric anthropometric data from longitudinal electronic health records: protocol and application to a large patient cohort
+Hang T. T. Phan1,2 ✉, Florina Borca2,3, David Cable3, James Batchelor1,2, Justin H. Davies3,4 & Sarah Ennis1,2,4
+‘Big data’ in healthcare encompass measurements collated from multiple sources with various
+degrees of data quality. These data require quality control assessment to optimise quality for clinical management and for robust large-scale data analysis in healthcare research. Height and weight data represent one of the most abundantly recorded health statistics. The shift to electronic recording of anthropometric measurements in electronic healthcare records has rapidly inflated the number of measurements. WHO guidelines inform removal of population-based extreme outliers but an absence of tools limits cleaning of longitudinal anthropometric measurements. We developed and optimised
+a protocol for cleaning paediatric height and weight data that incorporates outlier detection using robust linear regression methodology using a manually curated set of 6,279 patients’ longitudinal measurements. The protocol was then applied to a cohort of 200,000 patient records collected from 60,000 paediatric patients attending a regional teaching hospital in South England. WHO guidelines detected biologically implausible data in <1% of records. Additional error rates of 3% and 0.2%
+for height and weight respectively were detected using the protocol. Inflated error rates for height measurements were largely due to small but physiologically implausible decreases in height. Lowest error rates were observed when data was measured and digitally recorded by staff routinely required
+to do so. the protocol successfully automates the parsing of implausible and poor quality height and weight data from a voluminous longitudinal dataset and standardises the quality assessment of data for clinical and research applications.
+With the availability of digital electronic health systems, ‘big’ clinical data has become more accessible to the research community1,2. The big data era, which includes using data obtained from heterogeneous digital sources, has enabled novel opportunities for conducting empirical clinical research. At the same time there are challenges using such data for research purposes, including the need to adapt existing and develop new methodologies to cope with the scale and complexity of the data3. However, a more fundamental issue for researchers is the require- ment to undertake data cleaning, as incorrect clinical measurements entered into an electronic health record (EHR) will significantly affect the quality of dataset. Data cleaning can be time-consuming and involve multiple stages including detailed data analysis to identify error types, data inconsistencies, outlier detection and imple- ment data transformation where required4,5. Thus, developing automated methods for data cleaning is desirable.
+Height and weight are the most commonly recorded anthropometric measures for the assessment of child health in both clinical practice and research studies. Longitudinal height measurements give an indication of well-being and perturbations may be an indication of nutritional, endocrine, cardiac or other abnormalities that should prompt a clinical decision for investigation or intervention. Body mass index (BMI), defined by heights
+1NIHR Southampton Biomedical Research Centre, University Hospital Southampton, Southampton, UK. 2University of Southampton, Southampton, UK. 3University Hospital Southampton NHS Foundation Trust, Southampton, UK.
+4These authors contributed equally: Justin H. Davies and Sarah Ennis. ✉e-mail: hang.phan@soton.ac.uk
+and weights, may be used to establish risks of prevalence of diseases6. In children, longitudinal changes of BMI provide insight into predisposition to health problems such as obesity, hypertension, type 2 diabetes and nutri- tional insufficiency.
+World Health Organisation (WHO) guidelines 7 can be used to exclude biologically implausible values (BIV) from the EHR for childhood height, weight and BMI data, by converting the measurements to standard deviation scores (SDS) and using defined parameters to exclude extreme values (e.g. height to age z-score (HAZ) exclusion if < −6 or >6). However, there are few studies which have evaluated methods for cleaning periodical longitu- dinal anthropometric data 8. For example, some have identified BIVs for annual longitudinal values where the mean changes of BMI values exceed 3SDS or −3SDS and height decrements greater than 1 inch/year, and mean increases in height> 3SDS9,10. Others10 have suggested removing weight measurements where annual changes exceed 22.7 kg or 27.2 kg if the individual was severely obese at baseline, any height decrease and any height increase > 15 cm a year. These methods were developed for identifying extreme changes in periodical measure- ments and do not detect less extreme changes and so are not applicable to children where growth is dynamic. Neither are they applicable to the big-data scenario where anthropometric measurements are non-periodical. More recently the jack-knife residual method, applicable to paediatric patients with ≥4 datapoints, was suggested and applied to a paediatric anthropometric dataset for children ≤2 years old11. Although simple to use, it can be too strict in defining the range of plausible values hence not allowing more pronounced fluctuations in longitudi- nal data that are typical in the paediatric clinical setting where an individual can reduce or gain significant weight during or after a treatment period12,13.
+University Hospital Southampton (UHS) is a large teaching and research hospital serving a population of nearly 3.5 to 4 million people in South Hampshire. The Southampton Children’s Hospital of UHS initiated elec- tronical recording of anthropometric measurements in 2012 and subsequently developed an Electronic Growth Chart (EGC) which was rolled out for use across departments in the hospital in 201314. Since then, anthropomet- ric data on children has been systematically recorded, improving the accuracy of growth data presentation on a growth chart and enhancing the experience of sharing growth data by clinicians between paediatric specialities. It has also presented an opportunity for research studies to use longitudinal routine patient care anthropomet- ric data and make correlations between childhood growth and development of disease or efficacy of therapy. However, data recorded for routine clinical care by end-users can be prone to typographical or default value entry errors often related to time pressure for care delivery. Hence it is necessary that the anthropometric data be cleaned and processed before it is used for research purposes.
+In this study, we developed an automated protocol for identifying outliers of longitudinal routine paediatric height and weight measurements using state-of-the-art outlier detection methods. Concurrently, a subset of UHS electronic paediatric height and weight data of patients aged 2–20 years old, the gold-standard dataset manual curated for parameter optimisation, were assessed for data quality. We demonstrate how dataset scrutiny can identify and target training needs in anthropometric assessment in a teaching hospital.
+Materials and methods
+Anthropometric data scope and extraction. Electronically recorded height, weight measurements and date of birth was extracted for all patients admitted to UHS from 1932–2018 where the patient’s age at date of meas- urement was between 2–20 years. Data prior to 2008 were paper-based archived data transcribed into the elec- tronic EPR system since its introduction in UHS. Measurements are recorded to an accuracy of 1 decimal place for weight (kg) and height (cm). The occupation and department of the staff members entering the data was also cap- tured. Measurements of children of age less than 2 years were not considered in this assessment as the absence of gestational age data prevented accurate calculation of height for age z-scores (HAZ), weight for age z-scores (WAZ) and weight for height z-scores (WHZ). From the raw measurements of height (H, metre) and weight (W, kg),
+BMI was calculated as W/H2 and HAZ, WAZ and WHZ were calculated using the LMS method15.
+Data quality indicators. In assessing the quality of the captured anthropometric height and weight meas- urements, established data quality indicators for children ≥ 2 years of age were applied: (i) standard deviation (SD) of HAZ, WAZ and WHZ16 (ii) Myer’s Index (MI) for height and weight where MI is a measurement of digit preference of recorded data17. Myer’s Index calculates the divergence in the frequency of the ending digit in the measurements compared with the expected uniform distribution where there is no digit bias. The higher the value, the more biased the measurement towards a digit or two in all measurements, reflecting rounding effects.
+Conventional data cleaning. The thresholds for normal ranges of HAZ, WAZ and WHZ specified by the WHO Child Growth Standards 18 were applied for height, weight and BMI measurements. Those satisfying the
+condition of HAZ, WAZ or WHZ being within the [−6,6], [−6,5] and [−5,5] ranges respectively were retained for further analysis.
+Implausible flagging of sparse data. When longitudinal measurement data were sparse e.g. the number of entries per individual was less than four, an implausible increment or decrement flag was applied e.g. gain or
+loss of >25% of weight within one day; gain or loss of >40% of weight within three months; gain or loss of >50% of weight within one year; gain of >15% of height within three months; any decrease in height exceeding 1 cm
+were flagged for manual checking.
+Outlier flagging method for longitudinal data. For outlier flagging of longitudinal anthropometric measurements, robust regressions of the linear regression methodology was adopted19. Robust regressions can handle multiple outliers by introducing residual statistics including influence measurements such as Cook’s dis- tance, DFFITS, DFBETAS20 (see Supplementary for method details). Datapoints with influence statistics exceeding suggested thresholds are temporarily removed from the inference and the regression parameters are re-estimated
+from the remaining data. This results in a regression line that best fits the most reliable data. It is this regression line that is used to discriminate outlying datapoints from the entire set of datapoints using the SD fold threshold θ.
+Additional checks on height data. In addition to robust regression analysis of the data to detect outli- ers, height measurements were additionally inspected to flag anomalies such as variation in adult height and/or
+height decrease over time as follow. Final adult height is generally reached at approximately 18 years21, therefore, variation >1 cm from the median height measurements of patients older than 18 years flagged an error in data
+recording. Additionally, any decrease in height exceeding 1 cm also prompted a flag to cross check recorded data manually. This check was applied regardless of the number of datapoints in any set of measurements.
+Details of the overall longitudinal height and weight data outlier flagging protocol is summarised in Box 1.
+Box 1 Summary of final protocol for outlier flagging for longitudinal height and weight measurements of a patient
+1. Flag data not satisfying WHO guidelines for heights, weights and BMIs whose SDS values fall beyond the ranges [−6,6], [−6,5] and [−5,5] respectively, remain n datapoints
+2. If n < 4: assess the implausible increments/decrements of height and weight measurements:
+i. For weight: for each pair of consecutive measurements, use the following method to flag extreme changes as below:
+• Time span ≤ 1 day: beyond ±25%
+• Time span ≤ 3 months: beyond ± 40%
+• Time span ≤ 1 year: beyond ± 50%
+ii. For height
+• If time span ≤ 3 months, height increase is ≥15%
+• If the height measurement at a time point is at least 1 cm smaller than that at the preceding time point, flag the data at that time point.
+3. With the remaining data, where n > =4:
+a. Apply the ordinary least square (OLS) linear regression method of the SDS values as a linear function of age (number of variables k = 1)
+b. Calculate influence values: Cook's distance, dffits, dfbeta for age. Retain data that have Cook's distance <1, |dffits| <2 and |dfbeta_age| < 2/√n to re-estimate the regression line and obtain the SD
+of the residuals. c. Any patient whose SD of the residuals for height or weight is larger than 0.47 or 0.76 respectively has their whole series of measurements flagged for manual inspection. d. Where the SD of the residuals for height or weight is ≤1, flag any individual datapoint with residual error exceeding θ x SD where θ is 2.9 for weight and 2 for height (as informed by parameter tuning). e. For height data:
+i. Perform adult height check: for age measurements not flagged in (2c) within the range 18–20 years, calculate median value for that individual Mh, and flag as outlier any height measure- ment difference exceeding 1 cm.
+ii. Across all age ranges and for data not already flagged, perform height decrease check. If the height measurement at a time point is at least 1 cm smaller than that at the preceding time point, flag the data at that time point.
+4. If the total number of datapoints flagged (by any step) exceed 40% of the longitudinal data, the whole series of longitudinal data is flagged for manual inspection.
+parameter tuning. Typically, datapoints exceeding 2 times the SD (θ) of any series of measurements are nominally flagged as outliers, corresponding to an outlier rate of 5%22. However, for voluminous datasets of
+growth data in children, this parameter may be unnecessarily stringent. The tuning of θ was facilitated by a ‘gold-standard’ dataset from UHS, manually curated by an endocrinologist (JHD), where each patient had ≥7 datapoints (Supplementary text). This gold-standard dataset consisted of 6,279 patients with 89,258 weight measurements and 4,396 patients with 55,688 height measurements. Of these, 208 (0.23%) weight and 302 (0.54%) height measurements were deemed ‘implausible’ by the endocrinologist. Additional height checks identified a further 191 (0.34%) height measurements failing the adult height check and 1,237 (2.22%) flagged by the height decrease
+
+(a) Contingency table of weight outlier flagging
(b) Contingency table of height outlier flagging
Weight θ = 2.9
Manual curation by clinician
Height θ = 2
Manual curation by clinician
Impossible
Plausible
Impossible
Plausible
Flagging by protocol
Outlier
189
2,110
2,299
Flagging by protocol
Outlier
1,694
2,775
4,469
Plausible
19
86,940
86,959
Plausible
36
51,183
51,219
208
89,050
89,258
1,730
53,958
55,688
Sensitivity = 90.87%
Sensitivity = 97.91%
PPV = 8.22%
PPV = 37.91%
+Table 1. Contingency tables for chosen values of θ for weight and height and their sensitivity and PPV#. #PPV is Positive Predicted Value, defined as the proportion of positive results that are true positive, PPV = TP/
+(TP + FP).
+
+Figure 1. Percentage of datapoints identified as true errors in the gold standard dataset stratified by year for weight and height, weight for height. Outliers were split into three types: height outlier flagging using linear regression (LR), height entry error with adult height check and height with height decrease check.
+check, totalling 1,730 flagged height measurements (3.11%). This yielded a gold-standard dataset with a defined set of ‘true’ errors.
+Sensitivity and specificity metrics were evaluated for θ ∈ [1.5,5.5] using the gold standard dataset. Here, a true positive (TP) was defined as a datapoint identified as an outlier that was deemed clinically implausible by the clinician, a true negative (TN) was a value that was not flagged as an outlier by our method and identified as plausible by the clinician, a false positive (FP) was a true plausible value wrongly flagged as an outlier, and a false negative (FN) was a truly implausible value not flagged as an outlier by the protocol. Therefore, the positive pre- dictive value (PPV) is an important metric to consider. Ideally, any given protocol should maximise the number of true outliers as a proportion of all data flagged for manual review while maintaining good sensitivity to detect all true outliers.
+The gold-standard UHS data were used to calculate sensitivity and PPV for θ ∈ [1.5,5.5] (Fig. S4). For both height and weight, it was desirable to maintain sensitivity above 0.9 while maximising the PPV. Hence for height,
+the typical value of θ = 2 was selected but for weight measurements, it was observed that increasing θ to 2.9 main- tained sensitivity above 0.9 but had a dramatic effect on reducing the manual curation of false positive outliers (Table1). These values were used in the final protocol described in Box 1.
+The final selected values of θ were applied to gold standard data sets for height and weight respectively. From 55,688 height measurements, a subset of 4469 measurements (representing 2635 patients) were flagged as out- liers for manual inspection. Approximately 92% of the data passed checks and could be automatically classified as plausible. Of the 8% of flagged measurements, the 1237 (2.2%) due to decreases in height may be excluded without further clinical review and only 5.8% of the data may be subjected to further expert review or excluded depending on application. Importantly, the protocol failed to flag 36 measurements across 25 patients that the clinician subsequently flagged as implausible. This represented 0.06% of possible erroneous measurements that would go undiscovered by automated cleaning. Similarly, for weight, 2299 (2.6%) measurements from 1875 patients were flagged as requiring manual expert review while 97.4% of the data passed automated checks. Only nineteen datapoints (0.02%) that were deemed by the clinician as implausible were missed by the protocol.
+All the data processing and protocol implementation was performed using the open-source programming language Python version 3.723. The ordinary least square method OLS from the Python package statsmodel24 was used to perform LR. The script for calculating SDS values of anthropometric measurements and outlier
+
+Figure 2. Manual outlier curation results of UHS gold standard paediatric height and weight data: (a) Percentage of outliers for each of the occupation categories for weight, height using LR, height with adult height check, and height with height decrease check. (b) Percentage of outliers for each of the department categories for weight, height using LR, height with adult height check, and height with height decrease check.
+detection described by the pipeline is available for use from https://github.com/hangphan/peanof/. This includes the portable Docker container25 where all dependencies required for running the script were set up and ready to be executed on any environment where Docker is made available.
+Ethics and information governance. The study was approved by the IG management team of the University Hospital of Southampton (UHS). Ethics approval from the Research Ethics Committee and Health Research Authority, and informed consent was waived by the internal review board at the R&D Department of UHS as this is a combination of an Audit against WHO guidance and Service Evaluation. The anthropometric data in UHS were retrospective data and anonymised. All methods used in this study were performed in accord- ance with the relevant guidelines and regulations.
+Results
+Data quality of gold-standard longitudinal data. The ‘gold-standard’ UHS height and weight data- set enabled assessment of true data quality. Chronologically, both height and weight measurements across the 2008–2018 were stable with an error rate of ~3% for height and 0.2% for weight (Fig.1). The discrepancy in error rates between the two measurements was largely attributable to decreases in height which were deemed physio- logically impossible.
+Outlier rate by occupation was highest in the Pharmacist group (0.27%) followed by Others (0.20%) and Dietician (0.16%) for weight. The Pharmacist group recorded the most errors in height as assessed through man- ual review (2.4%) and using the adult height check (5.7%, Fig.2a). This likely reflects the pharmacist’s focus on estimated weight and not height for prescribing purposes.
+By department, the Others group has the highest error rate for weight (0.48%) followed by Dietetics/Speech and Language Therapy and Paediatric Neurology (0.16%, Fig.2b). For height data, the highest rate of data deemed implausible though manual review was observed in Dietetics/Speech and Language Therapy (0.63%) followed by Paediatric Medicine (0.44%) and Paediatric Oncology (0.40%). Additional height checks saw the highest combined error rate in Dietetics/Speech and Language Therapy (2.05%) followed by Paediatric Oncology (1.25%, Fig.2b).
+Application of automated cleaning protocol to the entire UHS paediatric height and weight dataset (n = 68,595 patients). UHS data summary and characteristics. The entire cohort contained all
+records for patients aged 2–20 years, dating from 1932 to 31/12/2018. A total of 214,983 weight measurements (68,273 patients) and 146,635 height measurements (47,616 patients) were obtained for 68,595 paediatric patients in the UHS EPR (Fig.3a), resulting in 142,643 BMI values (46,479 patients).
+The number of records was low prior to 2008 (1932–2008) and increased from 2008, reflecting the gradual introduction of EPR system into UHS departments, with a sharp increase in 2014 when the EGC was introduced at the end of 2013 (Fig.3b). The number of weight measurements recorded was about 30% higher than that of height during 2014–2018 period. Additional description regarding age group at initial measurement, length of follow-up time is presented in Supplementary (Fig. S4a,b).
+Patients were grouped by their respective number of longitudinal height and weight measurements. There is an excess of patients with a single measurement entry and these represent approximately half of the cohort, reflecting paediatric patients with a single hospital visit to departments such as emergency. Patients with ≥7
+entries for height and weight represented ~10% of the cohort but contributed almost half of the entire dataset for both height and weight (Fig.3d,e). These represent the patient population whose ill health may confer growth and developmental irregularities requiring frequent monitoring.
+
+Figure 3. UHS age 2–20 years’ height and weight data (1932–2018) summary: (a) Number of patients and records of height and weight, broken down by number of datapoints per patients. (b) Total number of height, weight and BMI measurements over time from prior to 2008 to 2018 (c) Percentage of data flagged by WHO guidelines over time. (d) Number of patients within groups of patients defined by their number of longitudinal datapoints for height and weight. (e) Number of height and weight records per group of patients binned by number of datapoints per patient.
+
+Figure 4. One decimal place digit distribution for height and weight measurements, demonstrating the bias in recording height and weight measurements, rounding to the precision of kg for weight and the precision of cm or 0.5 cm for height. This bias is reflected in the Myers’ index of height and weight measurements.
+
+
WAZ
HAZ
WHZ
DHS RANGE OF SD
1.01–1.49
1.08–2.33
1.01–2.02
PRE-WHO PROCESSING SD
5.29
5.90
15.55
POST-WHO PROCESSING SD
1.45
1.32
1.36
+Table 2. Standard deviation of WAZ, HAZ and WHZ of the UHS 2–20 anthropometric measurement data.
+
+Figure 5. UHS data characterisation by occupation and by department of staff entering the data (a) Weight records by occupation (b) Height records by occupation (c) Percentage of height and weight data flagged by WHO rules by occupation (d) Weight records by department (e) Height records by department (f) Percentage of height and weight data flagged by WHO rules by department.
+Data quality by conventional quality indicators. The number of records failing WHO child growth standard guidelines for weight, height and BMI measurements were 1,386 (0.95%) and 814 (0.38%) and 677 (0.47%) respectively. The percentage of records excluded based on WHO limits was highest in 2013 at 2.37%, 2.64%, and 2.71% for weight, height and BMI respectively (Fig.3c). This coincides with the gradual introduction of EGC into various departments across UHS in 2013, reflecting a transient increase in error rate during the transition period to the electronic recording of data. A comparison of the five years preceding the transition to electronic data recording and the five years following 2013 identified a significant reduction (p_weight = 9.97 × 10−23, p_height = 1.05 × 10−8) in these extreme data recording errors.
+The SD of HAZ, WAZ and WHZ was calculated and compared against reported ranges of SD observed in the 52-country DHS survey16 (Table2). The SD values prior to exclusion of WHO extreme datapoints fell significantly outside the expected ranges. However, after exclusions of these extreme values, the observed SD values for height, weight and BMI z-scores fall within the expected limits.
+The Myer’s Index (MI) for digit preference of height data (excluding WHO extreme values) is consistent with the average observed across 51 countries in the DHS survey (MI_UHS = 17.91, MI_51-country average = 17.8, Fig.4). The
+MI for weight data is higher (MI_UHS = 10.69, MI_51-country average = 4.6), suggesting a greater tendency for estimation in UHS weight data.
+Data quality indicators by occupation and department of entry staff. The quality of the extracted data was also scrutinised by staff occupation and department to understand the most likely source of erroneous data and target the training in anthropometric assessments.
+For 75% of the observed data, the occupation and department of the staff member entering the data was available for evaluation. Ninety-three different staff occupations across 96 different departments were noted and the ten staff occupations that most frequently entered height and weight measurements are presented in Fig. 5a,b. Healthcare assistants most frequently recorded weight and height data (24% and 30% respectively) followed by Healthcare support workers, Staff nurses and Consultants.
+Application of the WHO flags for extreme values identified a low and consistent level of less than 1% of likely data entry error across occupations (Fig. 5c). The most striking peak in this type of error was 7.5% noted in the height data entered by pharmacists. However, given pharmacists entered only a very small proportion of the overall height data (n = 214 records) this higher error rate reflects a very small number (n = 16) extreme values.
+The Paediatric outpatient department contributed most data for weight and height measurements (47% and 58% respectively; Fig.5d,e). The WHO violation rate by department was small and relatively consistent across departments. The highest rate identified was 1.2% amongst weight values recorded within the Paediatric Endocrinology department (Fig.5f).
+Outlier detection for patients with longitudinal records in UHS dataset. For those with 2–3 height measurements, the implausible flagging method identified 655 (2.21%, 607 patients) height decreases >1 cm (Table3). No height
+
+Patient group
Filter
Weight
Height
All
WHO
1,386 (n = 864)
814 (n = 527)
2–3
Extreme change
119 (n = 114)
655 (n = 607)
4–6
OLS robust, few remain
680 (n = 170)
292 (n = 73)
Large SD
114 (n = 24)
296 (n = 61)
LR
3,626
+(n = 3,531)
3,029
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Scientific RepoRtS | | https://doi.org/10.1038/s41598-020-66925-7 8
diff --git a/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt b/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt
new file mode 100644
index 0000000..9d13890
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt
@@ -0,0 +1,108 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+
+See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/342834960
+Cross-Scenario Performance Modelling for Big Data Ecosystems
+Chapter · July 2020
+DOI: 10.1007/978-3-030-50334-5_14
+CITATIONS READS
+0 47
+2 authors, including:
+Fatimah Alsayoud
+Arab Open University - Saudi Arabia
+5 PUBLICATIONS 2 CITATIONS
+SEE PROFILE
+All content following this page was uploaded by Fatimah Alsayoud on 08 March 2023.
+The user has requested enhancement of the downloaded file.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Metadata of the chapter that will be visualized in SpringerLink
+
+Book Title
Artificial Intelligence in HCI
Series Title
Chapter Title
Cross-Scenario Performance Modelling for Big Data Ecosystems
Copyright Year
2020
Copyright HolderName
Springer Nature Switzerland AG
Author
Family Name
Alsayoud
Particle
Given Name
Fatimah
Prefix
Suffix
Role
Division
Department of Computer Science
Organization
Ryerson University
Address
Toronto, Canada
Email
Corresponding Author
Family Name
Miri
Particle
Given Name
Ali
Prefix
Suffix
Role
Division
Department of Computer Science
Organization
Ryerson University
Address
Toronto, Canada
Email
Ali.Miri@ryerson.ca
Abstract
Performance prediction is an essential aspect of several critical system design decisions, such as workload scheduling and resource planning. However, developing a model with higher prediction accuracy is a challenging task in big data systems due to the stack complexity and environmental heterogeneity. Workload modelling aims to simplify the connection between workloads factors and performance testing. Most of the workload models rely on a single scenario under test (SUT) method, where the trained and the evaluated data have the same distribution. However, a single SUT is not the ideal modelling method for big data workloads, as SUTs change frequently. Big data systems have a considerable amount of possible test scenarios that are generated from changing one or more elements in the testing environment, such as changing benchmarks, software versions, or cloud service types. To address this issue, we propose a cross- Scenario workload modelling method that aims to improve the workloads’ performance classification accuracy. The proposed approach adopts the Transfer Learning concept for reusing models cross different but related scenarios. In this work, we evaluate the proposed approach on multi real-world scenarios in Hadoop which is an example of big data system. The empirical results showed that the proposed approach is more accurate than SUT method.
Keywords
Performance - Modelling - Transfer learning - Big data ecosystems
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+Cross-Scenario Performance Modelling
+for Big Data Ecosystems
+Fatimah Alsayoud and Ali Miri (B)
+Department of Computer Science, Ryerson University, Toronto, Canada
+Ali.Miri@ryerson.ca
+Abstract. Performance prediction is an essential aspect of several crit-
+ical system design decisions, such as workload scheduling and resource
+planning. However, developing a model with higher prediction accuracy AQ1 is a challenging task in big data systems due to the stack complexity and environmental heterogeneity. Workload modelling aims to simplify the
+connection between workloads factors and performance testing. Most of
+the workload models rely on a single scenario under test (SUT) method,
+where the trained and the evaluated data have the same distribution. However, a single SUT is not the ideal modelling method for big data
+workloads, as SUTs change frequently. Big data systems have a consid-
+erable amount of possible test scenarios that are generated from chang-
+ing one or more elements in the testing environment, such as changing
+benchmarks, software versions, or cloud service types. To address this
+issue, we propose a cross-scenario workload modelling method that aims
+to improve the workloads’ performance classification accuracy. The pro-
+posed approach adopts the Transfer Learning concept for reusing models
+cross different but related scenarios. In this work, we evaluate the pro-
+posed approach on multi real-world scenarios in Hadoop which is an
+example of big data system. The empirical results showed that the pro-
+posed approach is more accurate than SUT method.
+Keywords: Performance · Modelling · Transfer learning · Big data ecosystems
+1 Introduction
+Big data ecosystems have become the main element in today’s technology. The ecosystems support big data sets and provide a variety of execution methods to meet system workload requirements. Big data ecosystems contain heterogeneous hardware and software, and they support a variety of data and workloads.
+Designing optimal management policies and actions for big data ecosystems requires active monitoring and intelligent modeling. The model is designed to test a particular objective, such as performance. Modeling for performance testing is one of the most successful management analysis approaches. It can be used to measure the performance of a specific system object or a specific executing workload. In
+ c Springer Nature Switzerland AG 2020
+H. Degen and L. Reinerman-Jones (Eds.): HCII 2020, LNCS 12217, pp. 1–18, 2020. https://doi.org/10.1007/978-3-030-50334-5_14
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Cross-Scenario Performance Modelling for Big Data Ecosystems 7
+both cases, the performance testing design is impacted by the characteristics of the running workloads. For example, a Hard Disk Drive (HDD) delivers its best performance when it serves sequential access workloads and not random access workloads. Another example is that the Hadoop ecosystem performs better with analytic workloads than Online Transaction Processing (OLTP) workloads.
+Workload performance modeling provides an approach to examine perfor- mance on a particular Scenario Under Test (SUT), where the scenario can include the deployment solution, the software version or the benchmark setup of a par- ticular Object Under Test (OUT). An example of OUT is Application Under Test (AUT). In general, the model result is a significant input element on many system decisions such as resource allocation. Therefore, it is crucial to design an accurate workload model as the performance test results reliability level is in line with the model accuracy.
+Designing an accurate workload model for big data ecosystems is a chal- lenging task due to ecosystem complexities and heterogeneity. There are several possible SUTs and lots of different case studies in big data ecosystems. For example, it is typical for the same ecosystem to have multi software versions, test workload performance with different benchmarking tools and to be executed on various deployment solutions [1].
+Different SUTs produce dissimilar workload distributions. Many workload modeling approaches assume that trained and evaluated data has a similar dis- tribution which is the same assumption as ML methods [2]. This assumption does not fit with big data ecosystem characteristics where the workload’s distribution is changed with many possible SUTs. Constructing a model for each SUT from scratch is time-consuming and resource intensive. A similar distribution assump- tion does not work well in many real-life cases. For example, in computer vision, there is a need to recognize numbers either coming from handwritten data or from a picture where they have dissimilar distributions.
+A number of deep learning related methods such as Transfer Learning (TL) are developed to deal with the distribution similarity constraint. TL provides a method to transfer knowledge between domains with a dissimilar distribution or dissimilar feature space to avoid building a fresh model every time the SUT is changed and to improve the model’s accuracy. It is a well-used method in computer vision and natural language processing researchers. In this work, we will use TL to improve the performance model in a big data ecosystem.
+1.1 Problem Statement and Motivation
+The need for an accurate performance model remains even when the SUT or the executing workload is changed in a big data ecosystem. Designing an accurate model for a big data ecosystem such as Hadoop while considering SUT and workloads changing is a challenging task. Although there is a lot of Hadoop performance modelling work such as [3,4] and [5], most of it focuses on a single SUT. Only some consider multi SUT. For example, [6] provide a comprehensive analysis of how the workload behaviour, characteristic and distribution changes with SUTs change, and [7] designed a map task scheduling model for multi
+cloud service under test. However, none of the work considers improving the performance model for a particular SUT by utilizing another SUT model.
+In practice, users typically change the setups to meet individual or application needs. For example, a big data ecosystem may be moved from on-premise to the cloud when there is a need for more storage. Another example is changing the benchmark measurement tool to analyze different SW elements. Although SUTs usually change frequently on a big data ecosystem, the scenarios modification factors have not been considered on the big data performance modelling yet.
+In this paper, we investigate the accuracy of a big data ecosystem perfor- mance model with the proposed cross-scenario transfer approach. This approach builds a performance model based on a particular SUT (Scenariosrc ) and then transfers the source knowledge into another SUT (Scenariotgt ) to improve the target model’s accuracy. A cross-scenario transfer approach adopts the inclusion method (multi scenarios) instead of the isolation (single scenario) method that is used by most existing performance modelling approaches. The inclusion method relaxes the sensitivity between model accuracy and the SUT characteristic. We demonstrate the approach with four scenarios: benchmarks, cloud service types, and Hadoop versions each with a couple of hypotheses. The experiential results show noticeable model accuracy improvement on the Scenariotgt with the pro- posed approach.
+The paper is organized as follows. Sections 2 and 3 give a background of work- load modelling and performance modelling challenges. The proposed approach overview is presented in Sect. 4. The evaluated case studies and the experimen- tal result are discussed in Sect. 5. Finally, related work and the conclusion are presented in Sect. 6 and Sect. 7, respectively.
+2 Workload Modelling
+In general, modelling provides a foundational methodology to abstract and rep- resent a particular aspect or relationship. Workload modelling establishes a con- nection between the workload characterization and the desired testing object. It helps to track how the workload and the corresponding testing object are changing. There are several possible algorithms for workload modelling such as predication, evolution, optimization and simulation. The algorithm is selected based on the model’s objective. It is important to select the right design factors and define an accurate workload model. This is because many critical manage- ment decisions are using it as one of their fundamental elements.
+Today’s big data ecosystems serve a variety of workload types such as Online Transaction Processing (OLTP), Decision Support System (DSS), analytical and Machine Learning workloads. Each type has unique attributes and characteriza- tion. Moreover, the workload’s pattern, behaviour and distributions change with the execution environment. Workload behaviours are very sensitive to execution environment components, setups and capability.
+Workload modelling provides a method to simplify the relationship between workload characterization and behaviours with the desired testing object for a
+particular testing environment [8]. The testing object is the workload attributes that the model is designed to test it, such as performance, cost and resource utilization. The object measurement metric defined during the model construc- tion is based on the final objective. For example, performance can be measured based on the workload’s execution time or the throughput. Another essential aspect of workload modelling is the testing environment that affects workload behaviour and testing object values. In general, the model design is based on data from an environment with an aggregation of SWs and HWs. However, usu- ally only one of the environmental elements is used to define the testing factors. For instance, in the application performance model, the application represents the testing environment and performance represents the testing object. Usually, the test application is called Application Under Test (AUT). The application performance model or workload model for performance testing investigates the relationship between application workloads and the corresponding performance.
+Each aspect of the workload model should be designed and selected care- fully since the accuracy of the design affects the accuracy of many management decisions and actions. The model can be used for descriptive, predictive and prescriptive analytics where the analytics output, for example, produces perfor- mance insight or predicts resource provisioning. The workload model can also be used for simulating workloads [9] and evaluating a system configuration [10]. Indeed, the workload-aware concept becomes a common aspect of different man- agement architecture.
+Workloads have different behaviours and patterns that change based on many factors like workload structure and the testing environment. For example, the behaviour of database workloads is different than the ML workloads. The last one is more complicated, requiring more resources and taking more time than the first one. The challenge occurs when a particular environment serves both types of workloads which is a normal situation in today’s applications. The workload- aware concept is adopted on the system to serve each workload with its need, and define the management decision and action differently for each workload.
+3 Big Data Performance Modelling Challenges
+Modeling big data workloads for performance testing or in short performance modelling is a challenging task due to the ecosystem’s complexity and the vari- ability of the workload. It is challenging to design an accurate model for a big data ecosystem that has many interacting components and for workloads with very wide distributions. Traditional performance modelling assumes that data comes from a single SUT and has the same distribution. Both assumptions do not meet the need of big data ecosystems. Big data ecosystems have a complex architecture with several stages, multi-configuration parameters and multi SW elements. These ecosystems contain many highly interactive stages such as com- puting, resource management and a distributed file system which control how the workload is executed, how many resources are allocated to it and where it should be placed, respectively. Each of the controlling decisions impacts the workload’s
+overall performance. Furthermore, the ecosystems have a massive amount of pos- sible configuration parameters. Each of them has multiple possible values and each of the values affects the performance differently.
+The SW elements in big data ecosystems are dependent on each other and some of the elements interact with elements from other ecosystems. For example, the Hadoop resource management element (YARN) [11] is used by many other systems such as Spark [12] and Storm [13]. Also, the Hadoop file system (HDFS) is used by OpenStack Swift and Amazon S3 [14]. The SW characteristics and the interaction have an implication on workload behaviour and therefore workload performance.
+Each aspect of the big data ecosystem architecture impacts the performance of the workloads and can cause a change in workload distributions. It is hard to keep track of how each aspect of the ecosystem impacts performance. As written by [1] “we do not know much about real-life use cases of big data systems at all”.
+Two well-known modelling methods are used for simplifying big data ecosys- tem complexity: white box and black box methods. White box applies when the internal details are essential factors for decision making like considering configu- ration values for configuration tuning [15] or configuration optimization [16]. In contrast, the black box method does not consider the internal ecosystem details, and it is used by most work that focuses on the testing output instead of ecosys- tem details. Most of the black box methods and many of the white box methods follow the original modelling assumption of using a single SUT with the same distribution. Such assumptions would require building a considerable number of models from scratch to cover the possible big data scenarios. The proposed approach in this work benefits from the pre-built models on constructing a new one to improve model accuracy, and save model construction time and resources.
+3.1 Scenario Under Test (SUT) Modelling
+Most performance modelling approaches rely on a single SUT where data is collected from the same environment setups. For example, if the desired test object is an application, then the model is built based on collecting or simulating data from a particular application. Usually, the model built for a particular application cannot work as accurately for another application.
+The single-SUT requirement in performance modelling comes from the restrictions of the algorithms used in the model. The most commonly used algorithms in performance modelling are analytic and ML algorithms. Both types of algorithms require the trained data and the evaluated data to have the same distributions and feature space. To guarantee those requirements, the data expected by the performance model needs to come from a single SUT.
+The issue is that most of today’s case studies deal with changing the original scenario for different reasons. The model’s accuracy cannot be guaranteed when any of the SUT factors are changed. For this reason, in most cases, the whole model has to be reconstructed when any change happens. A large number of models are needed to cover all of the possible scenarios.
+Even though a single SUT method gets great attention from both industrial and academic communities, it has several limitations such as lack of support for diverse scenarios. It requires constructing many models and isolating the built model from the other related models. It consumes time and resources, and is sensitive to workload distributions. These single-SUT limitations motivate us to define the cross-scenario method that can support multiple scenarios in big data ecosystems and improve performance model accuracy.
+4 Proposed Approach Overview
+
+Fig.1. Cross-Scenarios transfer performance modelling
+The proposed approach overview is illustrated in Fig. 1 and the procedures are listed below:
+– The examined dataset is Hadoop execution trace-data that is provided by the ALOJA open-access dataset [17]. The dataset has over 16,000 Hadoop executions with various setups like workload type, benchmark type, Hadoop versions, cloud service types and cloud providers.
+– To provide the cross-scenarios transfer method with the correct data, both the Source Scenariosrc and Target Scenariotgt have to follow the same prepa- ration process. For example, the process includes normalizing numeric data, coding categorical data and classifying the target output.
+– Once the dataset is prepared, the Scenariosrc and the Scenariotgt are defined according to the desired hypothesis. For each examined hypothesis, the defi- nition of the Source and Target scenarios are specified in Sect. 5.
+– The Cross-Scenarios transfer method applies for each formulated hypothe- sis. The method contains three steps: build the source model according to Scenariosrc, build the target model according to Scenariotgt , and build the cross-scenarios transfer model according to the built source model and the Scenariotgt.
+– Source and Target models are constructed with Multi-Layer Perceptron (MLP).
+– The built source model knowledge is used to build a cross-scenarios transfer model for the Scenariotgt.
+– The accuracy of results for the target (stand-alone) model and the target (cross- scenarios transfer) are analyzed for each hypothesis.
+– We execute each hypothesis three times to calculate the average result of stand-alone and Transfer Learning models.
+– To study the impact of sample size on the model’s accuracy, we examined each hypothesis with six sample sizes (50, 150, 250, 350, 450, and 500), represented in the experiments as ratios.
+4.1 Methodology
+Transfer learning is defined to relax distribution similarity constraints on trained and the evaluated data. TL assumes that the trained dataset and the validated dataset have different but related distributions. The TL method can be applied to almost all of the learning models such as classification, regression, and clus- tering. It provides a way to transfer knowledge between different learning tasks or between different domains. There are two types of domains: Source and Tar- get. The Source domain is where the knowledge transfers from and the Target domain is where the knowledge transfers to.
+5 Case Studies and Experimental Result
+In order to evaluate the proposed approach, three different case studies are defined as Hadoop software versions, benchmark types and cloud service types. Each case study contains real-life scenarios that are used to determine the exam- ined cross-scenario transfer.
+5.1 Software Versions
+Commercial and open-source software companies produce new software versions either to add new features or fix the software bugs. This can happen at any stage of the software life cycle. The frequency of producing new versions is in accor- dance with the software design model. In general, open-source software, such as big data ecosystems, release new minor and major versions more repeatedly than commercial software.
+Versions have different configurations and therefore, the trace data that is produced is different in products. The trace-based method is the most used work- load modelling method. Following how versions change is not a straightforward
+Table 1. Experimental results: Hadoop versions hypothesis
+
+Hypothesis
(Hadoop-1.0.3 → Hadoop-1.2.1)
(Hadoop 1 → Hadoop 2)
(Hadoop-1.2.1 → Hadoop-2.7.1)
Sample ratio
Stand-alone
TL
Stand-alone
TL
Stand-alone
TL
10%
0.236 ± 0.043
0.371 ± 0.100
0.270 ± 0.040
0.391 ± 0.017
0.243 ± 0.070
0.278 ± 0.063
30%
0.310 ± 0.035
This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
diff --git a/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt b/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt
new file mode 100644
index 0000000..e3368cd
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt
@@ -0,0 +1,160 @@
+
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+
+Received July 23, 2020, accepted August 2, 2020, date of publication August 7, 2020, date of current version August 20, 2020. Digital Object Identifier 10.1109/ACCESS.2020.3015016
+SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing
+of Large NGS Datasets
+ROBERTO R. EXPÓSITO , ROI GALEGO-TORREIRO, AND JORGE GONZÁLEZ-DOMÍNGUEZ
+Universidade da Coruña, CITIC, Computer Architecture Group, 15071 A Coruña, Spain
+Corresponding author: Roberto R. Expósito (roberto.rey.exposito@udc.es)
+This work was supported in part by the Ministry of Science and Innovation of Spain under Grant TIN2016-75845-P
+and Grant PID2019-104184RB-I00, in part by AEI/FEDER/EU under Grant 10.13039/501100011033, and in part
+by the Xunta de Galicia and FEDER funds (Centro de Investigación de Galicia accreditation 20192022 and
+the Consolidation Program of Competitive Reference Groups) under Grant ED431G 2019/01 and Grant ED431C 2017/04.
+ABSTRACT This paper presents SeQual, a scalable tool to efficiently perform quality control of large genomic datasets. Our tool currently supports more than 30 different operations (e.g., filtering, trimming, formatting) that can be applied to DNA/RNA reads in FASTQ/FASTA formats to improve subsequent downstream analyses, while providing a simple and user-friendly graphical interface for non-expert users. Furthermore, SeQual takes full advantage of Big Data technologies to process massive datasets on distributed-memory systems such as clusters by relying on the open-source Apache Spark cluster computing framework. Our scalable Spark-based implementation allows to reduce the runtime from more than three hours to less than 20 minutes when processing a paired-end dataset with 251 million reads per input file on an 8-node multi-core cluster.
+ INDEX TERMS Big data, next-generation sequencing (NGS), bioinformatics, quality control, Apache Spark.
+I. INTRODUCTION the pipeline. For instance, transforming the input data from The development of Next-Generation Sequencing (NGS) FASTQ to FASTA format may be necessary if any bioinfor- technologies [1], [2] has revolutionized biological research maticsapplicationcanonlyworkwithdatastoredinthelatter over the last decade by drastically decreasing the cost format. Currently, there are several tools to perform quality of DNA/RNA sequencing and signicantly increasing the control andpreprocessing of rawNGS datain order toensure throughput of generated data. The quality of NGS data is the necessary quality for further processing [4], [5]. considered very important for various downstream analyses However, state-of-the-art tools still require excessive time suchasgeneexpressionstudiesandgenomesequenceassem- to process the increasingly large datasets generated through bly [3]. However, NGS platforms introduce, as a downside, mainstream NGS platforms. Although there are some par- different kinds of artefacts in the raw sequence fragments allel tools that allow to accelerate their computations on (theso-called``reads'')suchasduplicates,poor-qualityreads shared-memory systems thanks to including efcient multi- and insertions/deletions, which can lead to serious negative threading support, this is not enough to complete the quality impact on downstream analyses. Therefore, most bioinfor- controlofcurrentlargedatasetsinreasonabletimesincetheir matics pipelines start by applying a quality control over the scalability is limited to the resources of a single machine. input datasets in order to increase the accuracy of subse- In this context, the exploitation of Big Data technologies quent processing. 
Some examples of these operations are seems an adequate approach in order to accelerate those the removal of duplicate reads, the deletion of reads with calculations on distributed-memory systems such as clus- low average quality, or their transformation to maintain only ters and cloud platforms, as extensively demonstrated by the fragments with high quality (trimming). Moreover, dur- the existing literature [6][8]. In this paper we introduce ing this preprocessing step the datasets sometimes must be SeQual1,ascalabletoolforqualitycontrolandpreprocessing transformed in order to adapt them to the requirements of of raw sequencing data implemented upon the most popular open-source distributed framework for Big Data processing:
+The associate editor coordinating the review of this manuscript and
+approving it for publication was Juan Wang . 1Source code available at https://github.com/roigalegot/SeQual.
+VOLUME 8, 2020 This work is licensed under a Creative Commons Attribution 4.0 License. For more information, see https://creativecommons.org/licenses/by/4.0/ 146075
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+Apache Spark [9]. SeQual is mainly inspired by PRINSEQ [10], one of the most popular tools for quality control which has been widely used in many recent biological studies [11], [12].ThemainadvantagesofPRINSEQoveralternativetools are its simplicity and great functionality, providing support not only for a wide range of quality control operations (such as ltering and trimming), but also for data formatting. Our toolalsoprovidesallthisfunctionality(andevenmore)butin a signicantly lower runtime by fully exploiting the parallel processing capabilities of Spark. Although there are a few parallel tools to remove duplicate DNA/RNA sequences (one specic operation that can be used for quality control) on distributed-memory systems [13], [14], up to our knowledge, SeQual is the rst publicly available tool intended for this typeofparallelsystemsthatprovidesfullfunctionality(more than 30 operations) instead of only allowing to remove dupli- cate reads. Furthermore, SeQual includes a graphical user interface intended for simplifying its usage.
+The remainder of the paper is organized as follows. Section II discusses the related work. Section III describes the overall functionality provided by SeQual. Section IV describes our parallel approach. The performance of SeQual is evaluated and compared to state-of-the-art quality control tools in Section V. Finally, Section VI concludes the paper and proposes future work.
+II. RELATED WORK
+To address the sequencing quality problem, besides the quality control pipeline supplied by some sequencing plat- form manufacturers, several standalone tools have been proposed in the literature. A representative list includestools such as FASTX-Toolkit [15], FastQC [16], PRINSEQ [10], NGS-QC [17], QC-Chain [18], FaQCs [19], Trimmo- matic [20], PEAT [21], AfterQC [22], FastProNGS [23] and PRINSEQCC [24]. With the expected increase in total generated data and decrease in costs associated with NGS technologies, one important concern is their processing speed. Some tools do not provide parallel implementations (FASTX-Toolkit, PRINSEQ), whereas others (FastQC) han- dleparallelismonlyatthelelevel,sotheycannotaccelerate the processing of a very large single dataset. The remaining tools do provide some kind of parallel support but all of them are based on multithreading, so their overall speed is limited to the computational resources of a single machine.
+In terms of functionality, FastQC does not have trimming and ltering features, whereas Trimmomatic is focused on just one operation type (trimming), and PEAT provides very few lter options to the users. FASTX-Toolkit does not even support paired-end datasets, requiring further postprocess- ing to link paired reads. Other tools (FaQCs, FastProNGS) do not support FASTA as input format, while also pro- vide basic user interfaces only limited to command-line interaction. Moreover, there are tools that just seem to be currently unavailable as their websites do not longer work (NGS-QC, QC-Chain). Among all of them, PRINSEQ is by far the solution that provides the widest functionality
+supportingdifferentquality-controlandpreprocessingopera- tions together with a nice web-based graphical user interface. This is the main reason why the functionality of SeQual has been based on PRINSEQ, even extending it. However, the sequential implementation of PRINSEQ using Perl clearly hinders its performance for large datasets, whereas itsmultithreadedCCCversion(PRINSEQCC)ismuchfaster butprovideslessfunctionalitythantheoriginaltool,whileits scalability is still limited to a single machine.
+SeQual tries to combine the functionality and usability of PRINSEQ together with the performance of PRINSEQCC but in a distributed manner relying on Big Data technologies. In fact, the exploitation of Big Data clusters to accelerate the storage, processing and visualization of large NGS datasets has been recently explored in multiple previous works. For instance, many bioinformatics tools implemented on top of Big Data processing frameworks such as Hadoop [25] and Spark [9] have emerged in recent years, from error correction [26], [27], duplicate read removal [13] and sequencealignment[28][31], tovariantcalling[32],denovo genome assembly [33], [34] and protein structure prediction [35][37], among many others. Most of these tools are exe- cutedwithinabioinformaticspipeline(orscienticworkow engines such as SAASFEE [38] or Pegasus [39]) that usually starts with a quality control of the input FASTA/FASTQ datasets. Therefore, they will benet from SeQual in order to accelerate this rst step of the pipeline, which reinforces the need of our proposal in the context of quality control and preprocessing.
+III. OVERVIEW OF SeQual
+SeQual is a parallel tool implemented in Java that currently provides a full set of 33 operations for performing qual- ity control and preprocessing on raw NGS datasets. It can receive as input either single-end or paired-end DNA/RNA sequences, which can be stored either in FASTA or FASTQ les, as these are the most popular unaligned sequence for- mats. The operations provided by SeQual can be divided into the following four main functionalities:
+1) Filters. These operations discard those input reads that do not fulfill certain criteria specified by the user. Filters are divided into two categories, depending on the number of sequences involved in the filter rule:
+• Single lters, which evaluate reads one-by-one. SeQual includes 12 single lters. For instance, sequencescanbelteredaccordingtotheirlength, quality or the absence/presence of a certain pattern in their bases.
+• Group lters, which compare reads by pairs and discard those that are equal (keeping the one with the highest quality score when possible). SeQual contains 5 group lters that allow, for instance,tocomparethesequencesascomplement or reverse-complement. The user can also specify acertainnumberofallowedmismatchestodiscard those sequences that are almost equal.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+146077
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+
+FIGURE 1. Graphical user interface included with SeQual.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+2) Trimmers. SeQual includes 10 operations in order to trim the beginning or ending of the sequences by removing those bases that are not interesting for the user. The user can specify the number of bases that must remain, or the quality required for the trimmed sequences.
+3) Data formatters. Three functions to convert from DNA to RNA reads (and vice versa) or from FASTQ to FASTA formats are also provided by our tool.
+4) Statistical operations. Finally, SeQual provides three additional functions to obtain some statistics about the initial and/or nal data. For instance, these operations can be used to count the number of input sequences, or to calculate their average length/quality.
+Regarding the usage of the tool, SeQual provides two execution modes:
+• Through the command-line interface by specifying:
+(1) the path to the dataset(s) as input arguments; (2) the operations to be performed on these datasets using a Java Properties file.
+• Through a graphical interface provided by SeQual in order to simplify its usage to non-computer science experts (see Fig. 1). This graphical interface has been implemented upon the open-source JavaFX project [40], which allows built-in separation between the application logic and the visual part of SeQual.
+It is worth noting that the user can apply multiple operations to the same input dataset in a single execution (see the available check boxes in Fig. 1). In this scenario,
+SeQual implements a priority-based strategy for all filters and trimmers to improve overall performance when multiple ones are selected by the user. Based on their priority, SeQual automatically sorts them to apply first those filters that can potentially discard more reads and those trimmers that can reduce more their length. This strategy aims to reduce overall runtime as subsequent operations can be accelerated taking advantage of this approach.
+For more details about all the available operations, compilation and execution instructions, as well as a brief overview of the graphical interface, refer to the detailed README file available at SeQual's website.
+IV. IMPLEMENTATION
+At the highest level of abstraction, the overall workflow of SeQual is divided into the following three main stages:
+1) Reading of the input dataset(s) specified by the user, consisting of one or two FASTQ/FASTA text-based sequence files when working in single- or paired-end mode, respectively.
+2) Processing of the input files according to the quality-control operations selected by the user in the graphical interface or, otherwise, specified in a Properties file when using the command-line interface.
+3) Writing of the processed dataset(s) to their corresponding output text files as a result of the computations previously performed.
+In order to understand how these stages have been implemented on top of Spark (Sections IV-B and IV-C), some basic
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+
+FIGURE 2. Spark example of combining map/filter transformations and count action over an RDD of type Integer.
+
+FIGURE 3. Example of two DNA reads in FASTQ format (100 base pairs).
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+concepts about the programming model provided by this Big Data framework need first to be introduced (Section IV-A).
+A. APACHE SPARK
+Spark [9] is a popular Big Data processing framework that supports efficient in-memory computations by relying on a novel, distributed data abstraction known as Resilient Distributed Dataset (RDD) [41]. Basically, an RDD is a partitioned collection of data elements that can be distributed across the nodes of a commodity cluster. One important feature of RDDs is that their partitions can be operated in parallel and cached in memory to be reused in subsequent MapReduce-like operations [42]. A Spark programmer can create an RDD in two different ways: either by parallelizing an existing collection of objects (e.g., a list); or by loading an external dataset from a supported file system. In order to allow data processing in a distributed manner, Spark provides support for the Hadoop Distributed File System (HDFS) [43] so that RDDs can be created and efficiently processed from datasets stored in it. Nowadays, HDFS is considered the most popular open-source distributed file system for Big Data processing, providing the fundamental storage layer within the Hadoop ecosystem [25].
+The RDD programming API provided by Spark supports a wide range of data-parallel operations that can be performed over an RDD. Those operations can be divided into transformations and actions. On the one hand, transformations (e.g., map, filter, join) create a new RDD from an existing one. For instance, a map transformation processes each RDD element through a user-defined function, returning a new RDD as result. Another example is filter, which returns a new RDD formed by selecting only those elements of the source RDD on which a user-defined function returns true. Note that transformations are lazily evaluated in Spark, so they do not compute anything until an action that requires the result from them is triggered. On the other hand, actions return non-RDD values, converting the laziness of transformations into actual computation. Actions can be used to either return a result to the main Spark program (e.g., reduce, collect, count), or to store an RDD in external storage after running a certain computation (e.g., saveAsTextFile,
+saveAsObjectFile). For instance, the reduce action aggregates all the RDD elements according to a user-defined function and returns the final result to the main program. As an illustrative example, Fig. 2 shows the chaining of a map and filter transformations together with a count action over an RDD of type Integer. Note that the user-defined functions executed over the input RDD are shown below the corresponding boxes for map and filter transformations.
+Finally, another interesting feature of Spark is that it allows to explicitly cache or persist the RDD elements in memory, thus providing much faster access to them the next time they are queried. This is extremely useful for implementing efficient iterative algorithms [44].
+B. RDD MANAGEMENT IN SeQual
+All the RDD objects managed by SeQual are created from the input datasets stored in HDFS, which represents the first stage of the overall workflow previously described. The most straightforward way to create an RDD from an input text file stored in HDFS would be using the textFile method provided by Spark. Unfortunately, this method is not able to handle properly the specific structure of the FASTQ/FASTA text-based file formats, as both involve multiple lines per sequence (e.g., four lines for FASTQ, as shown in the example of Fig. 3). This Spark method relies by default on newline characters to identify the individual records in the input file (i.e., it creates one input record per line). Although it is possible to change the default delimiter to separate individual records according to the sequence format (e.g., FASTQ reads begin with character `@'), this solution would not work since such character can also occur in the string that represents the quality scores associated with each base (qualities are stored in the fourth line of each FASTQ read, as shown in Fig. 3).
+To overcome such issues, other previous bioinformatics tools implemented using Big Data technologies [28], [45] generally perform a preprocessing of the input files to convert them into the required line-by-line format (i.e., one read per line). Next, the converted files are copied to HDFS to be processed. In the specific case of Spark, another solution is to create the RDD using the previous textFile method
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+146079
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+
+FIGURE 4. SeQual example of combining DNATORNA and TRIMLEFT operations.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+and then operate over it with additional transformations and actions to obtain the desired format [29]. However, those approaches incur additional disk/memory overheads, degrading the overall performance. Instead, SeQual relies on the Hadoop Sequence Parser (HSP) library [46] to create the input RDDs in order to avoid any additional preprocessing/transformation of the input files. HSP is a Java-based library that provides specific and optimized routines to parse FASTQ/FASTA files directly from HDFS, and it is currently compatible with Hadoop, Spark and Flink [47] data processing frameworks.
+Once the input RDDs are created using the HSP library (first stage), the transformations and actions provided by the Spark's API can process their partitions during the second stage according to the quality-control operations specified by the user, as will be explained in the next subsection. Finally, the RDDs resulting from performing those operations are written back to HDFS by SeQual to create the output files (third stage). In this case, Spark provides a suitable RDD action (saveAsTextFile) to do so straightforwardly.
+C. SPARK-BASED QUALITY CONTROL AND PREPROCESSING
+To efficiently implement all the functionality provided by SeQual (see Section III), each supported quality operation must be translated into the appropriate combination of transformations/actions to be performed over the input RDDs which have been previously created using the HSP library.
+Regarding single filters, these operations were implemented using an RDD filter transformation, as they evaluate input reads one-by-one. As mentioned before, this transformation returns a new RDD that contains only those elements of the input RDD on which a user-defined function returns true. So, the implementation of each single filter provides two functions for single- and paired-end mode, and their specific logic depends on the rule used to filter out sequences. For instance, the LENGTH filter compares the length of each read (i.e., the number of bases) with a minimum or maximum threshold specified by the user, returning false when the read must be filtered out from the resulting RDD and true otherwise.
+Group filters represent a much more complex computation as input reads are compared by pairs. For instance, the DISTINCT filter requires to check all read pairs in order to remove duplicated sequences. These group filters first generate a PairRDD, which is an RDD consisting of key/value pairs
+as elements. To do so, these operations apply a mapToPair transformation to the input RDD, which is similar to map but it allows returning a PairRDD. The function executed by mapToPair outputs as key a string that represents the bases of each read for the DISTINCT filter (or the reverse, complementary or reverse complementary if the filter requires so). As value, the function outputs the sequence object itself, which contains not only the bases but also the sequence identifier and the qualities (if available). Once this PairRDD is created, a reduceByKey action is applied over it so that all the values (i.e., sequences) for each key are aggregated and then reduced based on a given user-defined function. The reduce function simply discards one of these similar sequences, keeping the one with the highest quality score (if available). Note that the group filters are considered network-intensive operations as the reduceByKey action requires to shuffle data over the network in order to aggregate all the values for the same key.
+The implementation of trimmers and data formatters both rely on applying a single map transformation over the input RDD, performing the appropriate modifications to each read depending on the specific operation. For instance, the function executed by the map transformation in the case of TRIMLEFT (operation that removes a number of bases specified by the user starting from the left) modifies the string that represents the bases for each read using the substring Java method. Such modifications must also be performed on the string that represents the quality scores when available. An example of a data formatter is DNATORNA, whose function executed by map replaces each thymine base from the input DNA reads (represented by a `T' character) by its corresponding uracil counterpart (a `U' character) in the output RNA reads, using the replace method provided by Java. As a representative example, Fig. 4 shows the combination of both operations (DNATORNA and TRIMLEFT) over an input RDD containing four DNA reads.
+Finally, the implementation of the different statistical operations differs greatly. The COUNT operation was straightforward to implement as it takes advantage of the count action provided by Spark that returns the number of RDD elements (i.e., sequences) in the dataset. However, the remaining two operations (MEANLENGTH and MEANQUALITY) require a more complex approach, being very similar for both of them. To implement those functions, the aggregate action was selected. This action allows operating an RDD to generate a single final result that can be of a different type than that
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+TABLE 1. Cluster node characteristics. TABLE 2. Main configuration parameters of Spark and HDFS.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+
+of the input RDD. To do so, the aggregate action takes two user-defined functions as arguments. The first one operates once for each RDD element in a partition, so it is used to accumulate the results for each RDD. The second function combines all the intermediate results (one result per RDD partition) to produce the final result that is finally returned to the main program. For instance, the first function for MEANQUALITY computes the number of reads in each partition and the accumulated quality for all of them, while the second function combines all the accumulated qualities and number of reads for all the partitions. Next, the final result (i.e., the mean quality) is simply obtained by dividing the total quality score by the total number of reads.
+V. PERFORMANCE EVALUATION
+The correctness of the results provided by SeQual has been assessed by checking that it provides the same outputs as PRINSEQ (a widely used and tested tool) when applying identical operations over the same input datasets. Therefore, the experimental evaluation has only focused on execution time. In order to check the correctness of the statistics (not available in the state-of-the-art tools), we have compared the outputs of SeQual to the statistics provided by some text editors about the total number of lines and characters in the output files.
+To evaluate the performance of SeQual, an eight-node multi-core cluster has been used for the experimental evaluation. Table 1 shows the main hardware and software characteristics of each cluster node, which mainly consists of two Intel Xeon E5-2660 octa-core Sandy Bridge-EP processors at 2.2 GHz (i.e., 16 physical cores per node), 64 GiB of memory and one local disk intended to be used for both HDFS and intermediate data storage during the execution of the experiments. The cluster nodes are interconnected through Gigabit Ethernet (1 Gbps) and InfiniBand FDR (56 Gbps). The system runs Linux CentOS release 7.7.1908 with kernel 3.10.0-1062 and the Java version
+
+is Oracle JRE 1.8.0_241. According to these characteris- tics, Apache Spark version 2.4.4 was congured as shown in Table 2, which also contains the main relevant congu- ration parameters for HDFS (i.e., block size and replication factor).TheversionofHadoopdeployedintheclustertostore the input datasets in HDFS was 2.9.2. We have compared SeQual with PRINSEQ [10], one of the most popular quality control tools (see Section II), together with its multithreaded counterpart PRINSEQCC [24], using the latest available version of both tools. PRINSEQ was executed with Perl v5.16.3, whereas PRINSEQCC was compiled with GNU GCC v8.3.0 using the -O3 optimization ag.
+Two publicly available datasets in FASTQ format obtained from the Sequence Read Archive (SRA) [48], [49] of the National Center for Biotechnology Information (NCBI) [50], [51] were used for the performance evalu- ation: SRR534301 and SRR567455. Table 3 shows their main characteristics. The number of reads (fourth column in the table) refers to the number of sequences per input le contained in the dataset, whereas the read length (fth column)isexpressedintermsofthenumberofbasepairs(bp) per sequence. We have selected these datasets as they repre- sent two different scenarios in terms of size and read lengths.
+Table 4 shows the runtimes of PRINSEQ, PRINSEQCC and SeQual when processing those datasets both in single- and paired-end modes (i.e., processing one or two input les, respectively) for the following six representative operations:
+• NONIUPAC: single filter to remove those reads with one or more Non-IUPAC bases (any base other than `A', `T', `G', `C' or `N').
+• GCCONTENT: single filter to remove those reads with a percentage of Guanine (`G') and Cytosine (`C') lower or higher than a threshold specified by the user.
+• DISTINCT: group lter to remove duplicate reads maintaining the ones with the highest quality.
+• DNATORNA: data formatter to convert from DNA to RNA reads.
+• COUNT: statistical operation to count the total number of reads in the dataset before and after performing any other operation over it.
+• MEANQUALITY: statistical operation to compute the averagequalityofallthesequencesavailableintheinput dataset.
+We have not assessed the performance of complex jobs that combine several operations in order to keep this section easy to read. Nevertheless, the improvement of SeQual over PRINSEQ and PRINSEQCC in this type of jobs would be at least the addition of the performance improvement in the individual operations. Note also that Table 4 shows
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+146081
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+TABLE 3. Public datasets used in the experimental evaluation.
+
+TABLE 4. Runtimes (in seconds) for PRINSEQ (using one core), PRINSEQCC (using one whole node, 16 cores) and SeQual (using 16 cores in one node and 128 cores in eight nodes) when performing different operations on two different datasets in single- and paired-end modes. Operations not available in PRINSEQ and PRINSEQCC are indicated with ` '.
+
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+two runtime results for SeQual: using one whole node (i.e., 16 cores) and the eight nodes of the cluster (128 cores in total). PRINSEQCC was executed on the 16 cores of one whole node, while PRINSEQ only used one core, as it is a sequential tool. Statistical operations could not be com- pared as they are not available neither in PRINSEQ nor in PRINSEQCC.Moreover,PRINSEQCC doesnotprovidethe DNATORNA formatter.
+As can be observed, SeQual is significantly faster than the original tool PRINSEQ in all the scenarios even using only one node. When comparing SeQual with the multithreaded version (i.e., PRINSEQCC) using the same amount of hardware resources (i.e., one whole node), SeQual is faster for half of the scenarios (it depends on the dataset and/or the operation). For instance, SeQual is faster than PRINSEQCC for all the single-end experiments. Nevertheless, the main benefit of implementing SeQual upon a cluster computing framework such as Spark is the possibility of exploiting the performance of multiple nodes in order to reduce even more the execution time. When exploiting the whole cluster (8 nodes), SeQual is significantly faster than PRINSEQCC for all the scenarios. More specifically, our tool is on average around
+23.6 and 8.3 times faster than PRINSEQ and PRINSEQCC, respectively, providing significant speedups of up to 41.5x and 12.4x (both results achieved for the GCCONTENT filter operation when processing the SRR56 dataset). It is worth noting that the performance comparison has been limited to PRINSEQ and PRINSEQCC as, up to our knowledge, these are the tools of the current state of the art with the widest functionality (although, as can be seen in Table 4, SeQual provides even more operations). We have not compared to other tools such as Trimmomatic [20] as the number of operations that they offer is quite limited, and therefore in our opinion their functionality is not comparable to that of SeQual or even PRINSEQ. For instance, none of the operations that have been assessed in this experimental evaluation are available in Trimmomatic.
+In order to measure the scalability provided by the Spark-based implementation included in SeQual, Fig. 5 reports the speedups obtained when varying the number of nodes from one to eight. The baseline is the execution time of SeQual for each operation when using one whole node, i.e., the speedups show the acceleration obtained thanks to exploiting multiple nodes compared to using only one. As can
+
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+VOLUME 8, 2020
+146083
diff --git a/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt b/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt
new file mode 100644
index 0000000..3335519
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt
@@ -0,0 +1,125 @@
+International Journal of Recent Technology and Engineering (IJRTE)
+ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+A Big Data Framework for Quality Assurance and Validation
+S. Nachiyappan, Justus S
+ depends purely on format. It can be in any structured or Abstract: Big data is a new technology, which is defined by unstructured format or it can be also a corrupted file. The data
+large amount of data, so it is possible to extract value from the which are collected from the various sources like social media capturing and analysis process. Large data faced many challenges and digital media will be constructive and structured.It is dcoume ptolexvaitrioyauns d fepearfoturerms asuch nce. Mas anvoyluorgmae,nspizaetieodn, s vafariaceticohna,llveanlugees , tough to analyze the types of data. There are many types of
+while facing test strategies for structured and unstructured data data like we categorize under structure and unstructured. It is validation, establishing a proper testing environment, working very difficult to analyze all types of dataThere are some with non relational databases and maintaining functional testing. flexible solutions for DBMS and RDBMS such as Oracle.
+These challenges have low quality data in production, delay in The RDBMS is used for structured query language or SQL to execution and increase in cost. Reduce the map for data intensive manage, define, query, and update data. However, suppose business and scientific applications Provides parallel and scalable
+programming model. To get the performance of big data data size is irresistible, it seems that RDBMS can handle hard, applications, defined as response time, maximum online user data and if done, the process becomes more expensive. It proves capacity size, and a certain maximum processing capacity. In that relational databases are not capable of managing large
+proposed, to test the health care big data . In health care data data and some new technologies are needed for processing the contains text file, image file, audio file and video file. To test the data. Customary databases are accurate for structured data bpigre pdroactaessdinocgutesmetinnt,g abny dupsinost gprotwocesscoinngc etespts tinsuch g. Toacs labssigify dathtae and not for unstructured data. Big data contains the three
+data from unstructured format to structured format using SVM characteristics such as volume/variety and velocity always algorithm. In preprocessing testing test all the data, for the called as 3V’s.Volume refers to an algorithm ability to deal purpose data accuracy. In preprocessing testing such as file size with a large amount of data. The scale of the data set is the
+testing, file extension testing and de-duplication testing. In Post quantity for the clustering algorithms related to volume Proeasily tcessoinfegtch to thimepdlematae. nt the map reduce concept for the use of property, the higher the size, the handling outlines. The data
+set is a collection of data set properties. Classification of
+Index Terms: Preprocessing, Map reduce in Post Processing, features, nominal, ordinal, interval and ratio. Many clustering Structured data using SVM. algorithms support numerical and classification data. In large quantities, the size of the data set increases to maintain large
+I. INTRODUCTION data, and the dimensions do not even increase. It's a curse of
+ Big data is new forms of information processing that size. In many clustering algorithms are capable of performing promotes large volume, high Speed with communication setbacks. Noise data can be grouped with data points. Variety assets, improved awareness, cost effective, decision making indicates the ability of a clustering algorithm to perform and process automation. Data represented large quantities is various sets of data sets, such as numerical, classification, nothing but Big Data. True, there is no specific size parameter nominal and ordinal. A criterion for clustering algorithms is a that defines this technology size. This is the safe way to set of data and cluster shape type. The size of the data set is measure the standard route of terabytes even pet bytes. The smaller or larger, but clustering algorithms support larger data data travels from various directions, and the speed and sets for large data mining. In cluster shape, the set of data volume will be terrible. Data will be replaced at a faster pace cluster is based on size and type shape. Velocity refers to the and therefore require more processing, especially for social calculation algorithm's calculations based on the complexity media feeds. But it is not the only medium to get information. of the time period of the clustering algorithm. If the It comes from different sources and shapes. If you go through algorithm's calculations are too low, nothing algorithm has the data you can find text files, audio files, images, video files, less run time. The algorithms run based on the Big O Option. presentations, sensor datas, data bases and log files. It The Artificial Neural Network algorithm is based on a cognitive approach, namely, a neural network without the
+hidden layer. Although this approach could lead to poor quality in classification, it was easily selected for construction. As with the SVM model we created a perception classification for each binary combination. A node has an input layer of a node for classification. Perception has an output layer that represents a number of two categories that
+
Revised Manuscript Received on 30 July 2019.
* Correspondence Author
Nachiyappan S*, Assistant Prof (Sr.), SCSE, VIT University, Chennai.
Justus S, Associate Professor, SCSE, VIT University, Chennai.
© The Authors. Published by Blue Eyes Intelligence Engineering and
Sciences Publication (BEIESP). This is an open access article under the
CC-BY-NC-ND license http://creativecommons.org/licenses/by-nc-nd/4.0/
+belong to an example given
+either 0 or a 1.
+Using the full feature set rules for input layer increases the
+computation, but stabilizes the feature set for comparison with Big Data is defined as datasets whose size is very huge and it the SVM algorithm. cannot be adopted in a traditional database tools to do all the
+data processing. This is a specific definition which defines big
+II. RELATED WORK data in terms of its context not the metric. This was discussed in Mckinsey’s report 2011 NIST has defined big data in some
+BdepigenDatads udpooes n itsno t feameatunres thanat dit it is isa vderiffyerlarengtiatede volubmyethoe fd“Verata it y other way like “ big data is where the data acquisition data
+volume and velocity or variety of data limits the ability to larbigge data data”in anliterd “atuhurge e andata”d th.erTe herare e arsoe mme andyefindefitioinitions wnshichfor perform the analysis on data. There are certain limitations that
+plays a very important role. Big Data is Defined by IDC in which are needs to be addressed before processing it”. There 2011 : “Big data technologies describe a new generation of is also some other definitions which states that“software technologies and architectures, designed to economically libraries along with their associated algorithms that enable extract value from very large volumes of a wide variety of distributed processing and analysis of big data problems data, by enabling high-velocity capture, discovery, and/or across clusters of computer units” [1].
+analysis.''[1]. This explains the four characters or four V’s of
+Big data. Volume, Variety, Velocity and Veracity of data.
+
+Fig1. Big Data Validation Service
+There is a work which is carried out by an industry regarding
+big data testing, They have used the Big Data services for III. METHODDOLOGY
+each and every V’s. Here four types of testing’s are done first
+is to test the velocity, when the data comes inside the system A. File Categorization using SVM Algorithm
+or storage the rate of speed which it is extracting and loading The file classification is a function that automatically into target system. Second one is the volume testing which separates the set of file extension from the classification from tests the amount of data in which the map reduce algorithms the predefined set. The concept of file classification is a are used in specific to their business needs. Third one is the standardized number of predefined categories or fractions. variety of data where the type of data is important to File classification can be defined as a function of differentiate like structured or unstructured. If its unstructured automatically classifying electronic documents for their data then the data has to be processed and it has to be commenting classes based on their file extension. Each converted into a structured format to process it. Fourth one is document is not exactly one, multiple or category. Using veracity of data where the truthiness of data is going to be the machine learning, learning classifications of targets, and very important part as the validation and verification is automating those classifications automatically. This is a concern. Fig1. Shows the big data validation services and how learning problem overseeing. Due to the overlapping of it is going to be processed. categories, each category is considered a separate binary
+classification problem.
+Classification helps to identify the correct category of extension and store it on the server. In this process we must domain in use, in this section I decided to divide the cloud file use the SVM algorithm. SVM Algorithm Main concept into four categories related to a particular file, which is split classification
+into an image file, video file, text file, and document file. For
+extraction. Then get the extension and classify the file
+
+Fig 2: Overview of Big data testing
+File size and File extension Testing
+A. De-duplication in Preprocessing Testing File size and file extension is the one of the pre process In big data preprocessing technique, we've got to check the testing. Data has been collected from varied sources and when de-duplication, zero file size, then the file extension. In collection information the info the information set and de-duplication testing ,To transfer file the user and also the uploading the data into the big information system and before CSP perform each de-duplications. The de-duplication process it, to validate the file is empty or not. If the file size is operation is a twin of that within the baseline approach. zero the file is not uploaded into the cloud server. Then the additional exactly, the user sends the file tag to the CSP for File extension validation helps us in many ways to confine the the file duplicate check. If a file duplicate is found, the user extension of file. In the file extension validation, to test the can run the POW protocol POWF with the CSP to prove the file size limit. For example, the image file contains some limit, file possession. If no duplicate exists, CSP stores the cipher if the size is exceeds it is not uploaded into the cloud
+rtext with key and returns the corresponding pointers back to
+user for native storage. In de-duplication on the opposite hand B. Map Reduce in Post Processing
+of keeping the multiple information copies with an equivalent Map reduce is that this programming paradigm that enables file content, de-duplication eliminates recurrent information for large scalability across a whole lot or thousands of servers by keeping solely single copy and referring alternative in a very big data cluster. The Map reduce is straightforward redundant information thereto single copy. The to grasp for those that area unit acquainted with clustered de-duplication to eliminates duplicate copies of an equivalent scale-out data processing solutions.
+file. De-duplication also can be used at the block level, that
+eliminates duplicate blocks of information that occur in non
+identical files.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Retrieval Number: A1912058119/19©BEIESP Published By:
+DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org 2493 Sciences Publication
+International Journal of Recent Technology and Engineering (IJRTE)
+ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019
+ Map-Reduce Validation represent the checking of key-value pairs generation and validate the map-reduce by applying numerous business rules. The term Map reduce truly refers to 2 separate and distinct tasks that big data programs perform. the primary is that the map job, that takes a group of knowledge and converts it into another set of knowledge, wherever individual components area unit countermined into rows (key/value pairs). The scale back job takes the output from a map as input and combines those information rows into a smaller set of rows. In map scale back, the scale back job is often performed once the map job. The Health Care big data area unit hold on within the server. Within the user will fetch information quickly we've to use the map scale back.
+Table 1. Quality Attributes of Big Data
+S.N
+o
Quality Variable
Explanation
1
Data correctness
The correctness of the data is validated with respect to format and data types.
2
Data consistency
This validates the data consistency from various angles; it also refers to data gathered from various locations.
3
Data accuracy
This refers to closeness between the actual result and the expected result. Data from various sources are gathered and measured for its accuracy.
4
Data security
Security is one of the important concerns which need to be addressed and validated for the application's security and its integrity from various perspectives
III. TEST PROCEDURE
+In addition the quality factors which are discussed in this paper are as follows:
+Reliability:
+This assures the reliability of the big data applications under some specific conditions: how the system is going to perform, and how it behaves when a specific load is given to the system. Performance: How the big data applications perform under specific conditions; it also indicates the performance of big data apps, such as availability and response time.
+Correctness:
+This speaks about the rightness of the big data applications. Scalability:
+Scalability is the factor which speaks about the applications flexibility to scale. In some situations it should support to scale some huge data and huge repositories and storages from period to period. In the same way that the applications scalability should be tested for its purpose.
+Security:
+The validation of security regarding the big data application is done here at different stages.
+IV. RESULT
+A. Data Accuracy
+Data quality is one of the important factors which needs to be considered when we go for any testing; the first one we need to discuss is data accuracy. Data accuracy is an important factor of data quality. It indicates whether the data stored in that field is correct or not. In this implementation, a medical data set of 100000 sample records is taken as the test data set.
+The data accuracy is higher when compared to preprocessing. After the pretesting, each cluster provides the correct, accurate result. Before preprocessing the data is stored in an unstructured format; after preprocessing the data is formed into structured data and grouped into different clusters, of types such as image, video, document and text.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Retrieval Number: A1912058119/19©BEIESP Published By:
+DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication
+International Journal of Recent Technology and Engineering (IJRTE)
+ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019
+When the Quality challenges for Big data is being discussed the data quality of applications are also considered. The Quality variables of enormous information applications were secret nowadays. Traditional quality factors following robustness, performance and security can be valid in big data. Now coming to big data validations and the quality challenges this work discuss about the quality and validation process of big data. On comparing to customary software testing with the big data application testing process is entirely different and they are discussed in this paper in a brief manner.
+The test procedure for big data is as follows.
+1) Functional testing of big data, which includes rich test environments and domain-specific functions;
+2) Non-function testing, includes performance, reliability, portability, Security, system consistency and Quality of Service
+3) Big data Timing testing, checks timeliness of the system; Fig 3: Data Accuracy
+4) Big Data feature testing, targets user related system
+evolution and visualization
+These four steps are followed in testing the big data
+applications and feature testing which includes testing
+continuously with real time testing.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Retrieval Number: A1912058119/19©BEIESP Published By:
+DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication
+International Journal of Recent Technology and Engineering (IJRTE)
+ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019
+B. Data volume
+In data volume, each cluster takes more storage space before pretesting. After the implementation of the pretesting, the size of the data has been reduced. By means of de-duplication testing the duplicate data has been removed, and the storage space has been reduced far better than before preprocessing. Because of the removal of duplicate data and null-value data, and the file categorization, the storage space becomes lower in each cluster.
+
+7. Quality Assurance for Big Data Applications – Issues, Challenges and Needs – Chuanqi Taq, Jearry Gao. 2016.
+8. A Survey on Quality assurance techniques for big data applications, Pengcheng zhang, Xuewu Zhou, Jerry Gao, Chuanqi Tao. 2017.
+9. Big Data - Testing Approach to Overcome Quality Challenges – Infosys White paper – Vol 11 no 1- 2013.
+10. Big Data Testing Services, Infosys white paper – 2015
+AUTHORS PROFILE
+ Prof. S. Nachiyappan is working in VIT University Chennai campus, Completed his PG in Anna university in 2004 and his area of research is software engineering and Big Data. He is having 5 years of Industry Experience and 10 + Years of teaching experience. He is a member of ACM professional Chapter.
+Dr. S. Justus Worked in various industries as project manager and researcher, he has an over all experience of 17+ years in both IT and Academic. He has guided more than 15 PG students for the project and has published various papers in national and international journals. He is a member of ISTE, IEEE, IAENG.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Retrieval Number: A1912058119/19©BEIESP Published By:
+DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication
+International Journal of Recent Technology and Engineering (IJRTE)
+ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019
+Fig. 4: Data Volume
+V. CONCLUSION
+Big data information is as yet advancing, and analyzers and testers have a huge duty to recognize new thoughts for performing tests in the field of Big Data. A standout amongst the most testing things for a tester is to keep pace with the industry's evolving elements. In many aspects of the test, technical details behind the tester scene are unknown, but testing of Big Data Technology is quite different. There is no need to be strong in a Tester Fundamentals test, but in order to analyze many performance barriers and other problems, you need to know the minute details of database designs. Big data testers should first learn the parts of the big data Eco System. In this paper, 10000 sample data records are entered as big data in the same cluster mode. We come out with both preprocess and postprocess testing results. The future work in this is to test information with numerous cluster frameworks.
+ We have to give the more accurate result by using different algorithms.
+REFERENCES
+1. Avita Katal, Mohammad Wazid, R H Goudar, “Big Data: Issues, Challenges, Tools and Good Practices”, IEEE, 2013.
+2. Xiaoming Gao, Judy Qiu, “Supporting Queries and Analyses of Large-Scale Social Media Data with Customizable and Scalable Indexing Techniques over NoSQL Databases”, 14th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, 2014.
+3. Matthew Smith, Christian Szongott, Benjamin Henne, Gabriele von Voigt, “Big Data Privacy Issues in Public Social Media”, IEEE, 6th International Conference on Digital Ecosystems Technologies (DEST), 18-20 June 2012.
+4. Vapnik (1995), The Nature of Statistical Learning Theory. Springer, Berlin
+5. Burges, C.J.C. (1996). Simplified Support Vector Decision Rules. 13th International Conference on Machine Learning.
+6. Pengcheng Zhang1, Xuewu Zhou1, Wenrui Li2, Jerry Gao3,4 (2017) A survey on quality assurance techniques for big data applications.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Retrieval Number: A1912058119/19©BEIESP Published By:
+DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org 2495 Sciences Publication
diff --git a/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt b/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt
new file mode 100644
index 0000000..e12b618
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt
@@ -0,0 +1,203 @@
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1081
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+An Improvement of a Checkpoint-based Distributed Testing Technique
+on a Big Data Environment
+Bhuridech Sudsee, Chanwit Kaewkasi
+School of Computer Engineering
+Suranaree University of Technology, Nakhon Ratchasrima, Thailand, 30000 m5741861@g.sut.ac.th, chanwit@sut.ac.th
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1086
+
+Abstract— The advancement of storage technologies and the fast-growing amount of generated data have made the world move into the Big Data era. In the past, we had many data mining tools, but they are inadequate to process Data-Intensive Scalable Computing workloads. The Apache Spark framework is a popular tool designed for Big Data processing. It leverages in-memory processing techniques that make Spark up to 100 times faster than Hadoop. Testing this kind of Big Data program is time consuming. Unfortunately, developers lack a proper testing framework, which could help assure quality of their data-intensive processing programs while saving development time and storage usage.
+We propose Distributed Test Checkpointing (DTC) for Apache Spark. DTC applies unit testing to the Big Data software development life cycle and reduces the time spent for each testing loop with checkpointing. By using the checkpoint technique, DTC keeps the quality of Big Data processing software while keeping an inexpensive testing cost by overriding the original Spark mechanism, so that developers have no pain to learn how to use DTC. Moreover, DTC has no additional abstraction layers. Developers can upgrade to a new version of Spark seamlessly. From the experimental results, we found that in the subsequent rounds of unit testing, DTC dramatically speeds the testing time up, by 450-500%. In case of storage, DTC can cut unnecessary data off and make the storage 19.7 times more economical than the original checkpoint of Spark. DTC can be used either in case of JVM termination or when testing with random values.
+Keyword— Distributed Checkpointing; Apache Spark; Big Data Testing; Software Testing;
+I. INTRODUCTION
+The sensors, IoT devices, and the increasing number of fast-growing electronic devices and Internet users have been generating a tremendous amount of
+data recently. They are not only the large amount of data
+——————————————————————— Manuscript received December 27th, 2017. This work was supported by Suranaree University of Technology, and a follow-up of the invited journal to the accepted & presented paper of the 20th International Conference on Advanced Communication Technology (ICACT2018),
+Bhuridech Sudsee is with School of Computer Engineering, Suranaree University of Technology, Nakhon Ratchasrima, Thailand (corresponding author phone: +66-44-22-4422; e-mail: m5741861@g.sut.ac.th).
+Chanwit Kaewkasi is with School of Computer Engineering, Suranaree University of Technology, Nakhon Ratchasrima, Thailand (e-mail: chanwit@sut.ac.th).
+but their structures are also complex as well. This complexity makes the traditional data mining tools inadequate to manage today’s data [1].
+The MapReduce [2] programming model has induced the development of many frameworks such as Apache Hadoop [4], Map-reduce-merge [5] and Apache Spark [6], which aim to process data intensive tasks. Developers only need to rewrite their programming logic in the form of map and reduce functions in order to process data on a MapReduce framework. These functions will be automatically managed by the framework’s default configuration. This mechanism makes the MapReduce framework easy to use. At its simplest form, a MapReduce program usually starts by a map function creating key/value pairs from the input. These intermediate key/value pairs are then passed to a reduce function to produce the final results. The MapReduce model is parallel by nature. It is designed to allow developers to run MapReduce programs for high performance computing jobs using a commodity cluster, built from low-cost hardwares. With this kind of the cluster architecture, we can handle massive amount of data and process them on numerous cluster nodes without a single point of failure [3].
+Although the MapReduce model is easy to use for software development, it is quite tricky to test software written with the MapReduce model. Software testing is a vital part of the development process. Testing is usually 25-50% of the overall cost [8]. We found that the current mechanism is not enough to assure quality for Big Data processing programs. Unit testing is a software testing technique which properly leads to better levels of quality. However, tools like Scalatest [9] or jUnit [10] have their own limitations when used with a MapReduce framework like Spark. For example, SparkContext and SparkSession objects must be instantiated only once for each running Java Virtual Machine (JVM) to avoid unexpected testing results [12]. Spark-testing-base [11] also does not have a testing mechanism for Spark. Without modification, it cannot work on a Spark cluster because of its inability to distribute class files across worker nodes. The aforementioned techniques are not suitable for Spark simply because they are not designed to test programs that distributedly process large amounts of data.
+Test-driven development (TDD) is a software development technique that helps developers to focus on
+writing a specific test at a time. It additionally allows code improvement while preserving correctness according to the specification. TDD workflow consists of the following steps, (1) writing a minimum test (2) writing codes to just make the test passed, and (3) refactoring to remove unnecessary codes while still making the current test passed [13]. We call these steps a TDD workflow herein this paper. Applying TDD to data intensive programs is difficult due to the nature of workloads, which need to process on a cluster. So, developers require a special tool to help shorten each loop of the TDD workflow.
+Spark has cache, persist and checkpoint methods to help mitigate job failure. These mechanisms however do not help software testing process much. The main reason is that a cluster state cached or persisted by them does not survive across runs of JVMs. A cluster state saved by the checkpoint method does survive on disk but unfortunately it cannot be retrieved back by a newly started JVM [14, 15].
+In this paper, we present Distributed Test Checkpointing (DTC), a technique that leverages the checkpoint technique to enhance software testing for data intensive jobs. With DTC, developers can increase productivity when testing their software on a distributed cluster repeatedly. DTC applied a hash function on each data partition of a Resilient Distributed Datasets (RDD) [18] to use an identifier. Modification of an RDD or a Dataset can be traced by the hashed number. The testcase that uses the RDD is also hashed at the bytecode level. Combining these techniques, DTC is found to reduce testing time and storage required by checkpointing significantly compared to the original Spark’s checkpointing technique.
+The remaining of this paper is organized as followed. Section II discusses related works, including Apache Spark. Section III presents the design and internal mechanism of DTC. Section IV presents the system architecture of the cluster used by our experiments, and the experimental results. This paper then ends with conclusion and future works in Section V.
+II. BACKGROUND AND RELATED WORK
+A. Apache Spark
+Spark is a data intensive processing framework focusing on in-memory data processing [6], which is implemented in the form of Resilient Distributed Dataset (RDD) [18]. RDD is designed to take care of the data flow and handle the processing mechanism. An RDD could be created using one of the following methods (1) reading data from file (2) parallelizing collection in the driver program (3) transforming from another RDD (4) and by transforming back from a persisted RDD [6]. An RDD comprises with two kinds of command, transformations and actions. A transformation command transforms an RDD to another RDD. These commands are map, filter and groupByKey, for example. Another set of commands are actions, which are collect and count, for example. An RDD keeps all previous transformation inside itself. This direct acyclic graph of transformation is known as lineage. The beginning of the real computation occurs only when an action is called. This is the lazy evaluation nature of Spark.
+A mechanism for failure recovery that helps an RDD to resume the processing without re-computation from scratch are methods such as cache, persist and checkpoint. The cache method uses persistency at MEMORY_ONLY, while the persist method has several levels of persistency. The checkpoint method, in contrast, uses the technique which save data onto a reliable storage, such as HDFS, Amazon S3 or Ceph. An RDD is usually cached or persisted during its computation to avoid re-computation previous steps [15].
+The checkpoint technique is also applicable for Spark Streaming because it truncates the internal lineage, so the RDD does not need to knowledge of its parent. However, this mechanism is not designed for software testing. The re-computation is still required to start from the beginning when the testcase is re-run. The rerunning of the testcase destroys a Block Manager inside an Executor. This Block Manage is responsible for keeping cached and persisted data. The new Driver program and the testcase therefore is not able to access the location of checkpoints.
+In addition, Spark has introduced the Dataframe API in 1.3 and Dataset in 1.6. Both abstractions can be used interchangeably because Dataset[Row] is the type safer version of DataFrame. A dataset is also convertible to an RDD. In the case of DTC proposed in this paper, we read and write data directly without triggering any computation of related RDDs.
+B. Debugging framework for Spark
+A technique used to improve quality of the software is debugging. Developers usually debug to observe certain set of variables they are interested. However, in the Data-intensive Scalable Computing (DISC), the debugging process is difficult as data are computed distributedly on a cluster.
+BigDebug [7] is a tool designed to help Spark’s developers deal with debugging a Big Data program. There is a downside that the tool requires the user’s interaction during the debugging process. Those interactions make the debugging more difficult than that of normal programs because the Big Data programs are distributed by nature. Moreover, a BigDebug program cannot tackle the problem when the RDD being debugged requires changes. The whole debugging process needs to start over in that case. In case of the developer changing codes on-the-fly, the RDD will become inconsistent as some partitions of the RDD have been processed by the old version of the codes, while other partitions will be processed by the new codes. BigDebug supports Spark up to 1.2.1 as of the time of writing.
+C. Checkpoint implementation for Spark
+Researchers have been employed the checkpoint of Spark in many ways to improve its efficiency, as follows.
+Flint [26] was created atop the original checkpoint technique of Spark. It aims at applying checkpoints and storing their data on transient instances to reduce the VM usage cost. A transient instance is a kind of low-cost computing unit, which can be recalled anytime by its cloud provider. Flint solves this problem by writing an RDD’s partitions to an HDFS, which is operated on on-demand instances. We found that this implementation lacks a mechanism to prevent re-calculation when the JVM is terminated. In addition,
+their checkpoint will be saved automatically so developers need to prepare a huge amount of space in order to prevent the full of storage, which can lead to the failure of the whole system.
+TR-Spark [27] implements a similar approach to Flint. The difference is that TR-Spark allows fine-granularity checkpoints at the task level. By leveraging this level of checkpoints, the storage usage could be reduced in comparison to checkpointing the whole RDD. However, TR-Spark is difficult to use, as developers need to collect the information of VM failures to let it know the failure probability. TR-Spark does not deal with changes of the Driver program.
+Automatic Spark Checkpointing (ASC) [25] was designed to help analyze the trade-off between RDD checkpointing and its restore. ASC performs this computation by estimating them from an RDD lineage. Nevertheless, this technique does not support checkpoint across JVM termination. It also lacks the ability to recognize the similarity or identity of an RDD.
+Spark-flow [24] aims to mitigate the effect of JVM termination for checkpoint restoration. It makes use of Distributed Collection (DC), a library similar to the Dataset API. DC is able to analyze an RDD at the bytecode level with ASM. It can identify the location of checkpoint calls, inside an anonymous function. It also uses the MD5 hash function to help detect changes at the bytecode level. However, DC has some downside as the following. First, when calling checkpoint on a DC, the data is re-read again after checkpointing. Second, when restoring from checkpoint, the action count will be triggered, so the re-computation kicks in. Finally, computation is mainly done on the Driver machine, so the mechanism is actually not distributed. This often causes Out-of-Memory exception inside the Driver program and it stops working.
+1 val data = sc.parallelize(Array(1,2,3,4,5)) 2 val distData = data.map(x => (x,1))
+3 distData.dtCheckpoint()
+4 distData.count()
+5 distData.collect()
+Fig. 1. Example of a dtCheckpoint call on an RDD
+
+Fig. 2. The dtCheckpointing mechanism inside DTC
+III. DESIGN AND IMPLEMENTATION
+Spark stores the RDD transformations in the form of a lineage graph a.k.a. the logical execution plan. When an action is triggered for a certain RDD, its job will be submitted to the DAG Scheduler to transform the RDD’s lineage into a directed acyclic graph, whose a vertex is an
+RDD partition and edge is a transformation. After that the staging process will be kicked in. This staging process will be started from the final action going backwards to the beginning of the RDD. However, in the real execution, the process will be performed from the beginning of the RDD forwardly to the final action. After the staging, the system obtains a set of Stages and Tasks.
+A checkpoint of an RDD however must be done before the first action is performed. From the source code in the Fig. 1, when a program starts to process an array of integer 1 to 5, the array will be passed as a parameter of method parallelize of class SparkContext. This result in a ParallelCollectionRDD stored in variable data. At line 2, each element from the data RDD is mapped with 1 using the map method as a key/value pair. The result is a MapPartitionsRDD stored in variable distData. At line 3, method dtCheckpoint is invoked. Please note that the original Spark and DTC both use the lazy evaluation mechanism, this means that the checkpoint method only marks at a certain point over the DAG, where checkpoints will happen there. At line 4, command distData.count() is the first action. When this first action is triggered, the checkpoint is not yet created. The computation then is started from the beginning of the RDD to the mark point. After that, the checkpoint is stored at the first upper directory level as a hash value generated by the mechanism of DTC. At the line no 5, method distData.collect() is invoked as the second action. The system will then check backwards from the action to the beginning of the RDD. This time the system will find a checkpoint already existed because there is a directory whose name matches with the hash. When the DAG Scheduler starts to transform the lineage, it uses the data directly from the checkpoint without re-computation. Please also note that action count() and collect() belong to the different jobs. The result computed by count() will not be included as an input for collect(), despite their order of execution.
+In Scala, it allows us to implement a new feature for a class by creating an Implicit Class then mixes it in to the existing classes, like RDD or Dataset. The DTC mechanisms proposed in this paper are implemented using that technique. With DTC as an Implicit Class, developers could still use all existing properties and behavior of an RDD, while having an additional method from DTC. Developers are also able to upgrade the Spark framework to the newer versions without rewriting this mechanism. DTC is more suitable for testing than Spark-flow, which has many abstraction layers. These abstraction makes it difficult to enhance capability of Spark-flow.
+A. DtCheckpointing
+This mechanism works when the method dtCheckpoint of an RDD or a DataSet is called. This call marks an RDD and also starts the Hashing RDD mechanism to obtain a directory path from hash transformation. If there is no directory matched the hash value, it means that the system never created that checkpoint. After the creation of the directory content of the RDD will be stored inside of it. But if the directory exists, the system will read the content as the data of the RDD. In Fig. 2, when an RDD is created using the parallelize method and is transformed with map followed by an invocation of dtCheckpoint. The sub-system
+DtCheckpointing kicks in to mark points in the RDD for later storing when action count is called.
+We usually perform the test on a Spark Cluster with SBT, which is an interactive build tool to help develop software with Java or Scala. SBT allows us to write a build file using Scala-based Domain Specific Language. It manages a program dependency with Apache Ivy. With DTC, we modify test commands of the SBT namely test, test-only, and test-quick to support not only the local execution but also in the real working cluster. We solve the problem of ClassNotFoundException and NoClassDefFoundError by making a fat jar via custom SBT task. So, we introduce testOnCluster for testing every testcase, testOnlyOnCluster to test a specific testcase, and testQuickOnCluster to test a certain testcase which may be failed from last time, or never tested or need re-computation. Our modification to SBT allows the new mode of testing on the real cluster.
+B. Hashing an RDD
+A hash function is a one-way function which can be used to check data modification. Even if one bit of data is changed, this function notices that modification. In this paper, we compare MD5, SHA-1 and SHA-256 because these algorithms have various hashing speeds and resource usages.
+This technique of the DTC framework is able to track the change of an RDD because the generated transformations. So we can use this mechanism to detect modification of any transformation back to the original RDD. When an action is triggered, the DTC framework detects all RDD dependencies and prepares a clean bytecode available by the CleanF property of the RDD, following by preparing other Java bytecode’s files which related to the dependencies. In preparation stage, DTC uses ASM, a tool to manage a Java bytecode [17], which Scala internally uses it for the compilation mechanism. With a ASM, the DTC’s hashing an RDD mechanism can access Java class file at runtime and de-serialize them for reverse engineering propose. DTC needs to remove some brittle information such as LINENUMBER or serialVersionUID from a class file. With this information filtered out, we can detect changes of an RDD or DataSet even when the line numbers have been changed.
+The result of class file analysis in preparation stage, after unnecessary dependencies was eliminated, these dependencies will compute hash number and input data, which the origin of an RDD will compute hash number also. The computation is distributed computing with Spark’s accumulator in the first level hash number computation will
+SET hash_array = empty array of string
+IF (HASH_INPUT_DATA = true) THEN
+ READ each data partition from (RDD or DataSet) COMPUTE hash of each data partition
+ APPEND hashes to hash_array
+ENDIF
+Fig. 3. Pseudo codes of the mechanism of Hashing an RDD
+compute hash number of input data for every partition, and then collect and reorder result because unpredictable computation time. After that, the DTC will compute hash number of sorted hash number again. Fig. 3, illustrates the steps of hashing mechanism please note that the computation of input data is an option that can specify with dtCheckpoint(true).
+IV. EXPERIMENTS
+A. Cluster configuration
+The experiments presented in this paper have been conducted on a Spark cluster consisted of 10 nodes. Each node is an Intel Core i5-4570 Quad-core with 4 GB of RAM. The drive node is an Intel Xeon E5-2650V3 Deca-core with 8GB of RAM. We use Apache Spark 2.0 for the experiments along with Ceph as the distributed file system over these 10 nodes. The Ceph storage is 10 TB. The system architecture is illustrated in Fig. 4.
+TABLE I
+COMPUTATION PROGRAMS AND INPUT DATA OF EXPERIMENTAL Program Input dataset
+Wordcount 31 GB of Wikipedia
+Triangle Counting 875,713 vertices and 5,105,039 edges PageRank 875,713 vertices and 5,105,039 edges Pi Estimation 109 times
+
+Fig. 4. The cluster architecture used by the experiments
+B. Methodology
+For the experiments, we use a MapReduce program Wordcount on a 31 GB data dump of Wikipedia, Triangle Counting with Google Web Graph [28], PageRank with Google Web Graph and, lastly, Pi Estimation with one billion iterations. Each program with its input dataset is shown in Table I. The Wordcount program splits sentences into an array of words and counts them using both RDD and Dataset (or DC in case of Spark-flow) with different checkpoint mechanisms. We tested each checkpoint mechanism 10 times consecutively and measured both space and time perspectives. Moreover, we tested 5 additional cases with JVM termination. Then we started the JVM again to test the recovery process of checkpoints.
+ Table II shows the comparison of checkpoint mechanism properties. If we do not use checkpoint, the system does not have the fault tolerance property. If we use the original Spark, it is not suitable for testing because its checkpoint mechanism does not work well in the test environment. In case of Spark-flow it does not work on the cluster environment out-of-the-box. DTC, on the other hand, is designed to address these problems in the testing
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1088
+TABLE II
+FEATURE COMPARISON BETWEEN CONFIGURATIONS
+Failure More abstraction Prevent re-calculation Suitable for
+Method Cluster
+tolerance layer from beginning Testing
+No-Checkpoint No No No No Yes Spark Original Yes No Yes Not Suitable Yes Spark-flow Yes Yes Yes Yes No DTC Yes No Yes Yes Yes
+TABLE III
+THE COMBINATION OF ALL EXPERIMENTAL CONFIGURATIONS
+Type Checkpoint Data Format Hash Algorithm Configuration RDD DataSet DC Java Kryo Avro Parquet MD5 SHA1 SHA256
+No-checkpoint √ √ - - - - - - - - Spark Original √ √ - √ - - - - - - Spark-flow - - √ - - - √ √ - - DTC √ √ - √ √ √ √ √ √ √
+environment. So, DTC provides the better environment to that we can multiply by 4 to roughly results Pi number. We support unit testing. tested 5 cases then stop the JVM, after that we re-run these
+Table II shows a brief differentiation of comparison 5 cases again on RDD.
+method that we will experiment. That meant, if we have no
+C. Experimental results (consecutively 10 cases)
+checkpoint it will lack failure tolerance, the Spark original
+checkpoint insufficient to testing. The Spark-flow push From the experiments, we start discussing in the case of developer in more abstraction layer by create a higher level no hashing input data, denoted not-hashinput by running of a DataSet and it not work on cluster naturally. In Table consecutively 10 cases. In this case the input will not be III, we show the combination of all experimental verified by hashing functions before the program starts. We configurations. Accordingly, the DTC introduce to rectify assume that development and during the tests. The that plain. experimental results are show in Fig. 5. At the first run,
+We compared with MapReduce Wordcount algorithms DTC and the original-checkpoint mechanism are
+on Wikipedia 31 GB with separating each word from each all slow with insignificant difference. The other with white space. And then, we filtered only word DTC-Java-SHA1 is slowest. It uses 636 seconds slightly
+occurred more than 10 million times, after that asserted TABLE IV
+with the most word occurred. We consecutively repeated CHECKPOINT’S STORAGE USAGE OF AN RDD
+these steps 10 cases and performed testing on 5 cases then Storage usage Size Unit stopped the JVM. After that we re-run these 5 cases again No-checkpoint 0 MB
+on both RDD and DataSet. Spark original checkpoint 9.870 MB
+Next, we compared with Triangle Counting Program DTC-Java-with-hash 0.987 MB
+which gathers the number of vertices whose has two DTC-Java-without-hash 0.987 MB adjacent vertices with an edge between them. And then DTC-Kryo-with-hash 0.501 MB perform PageRank Program to ranks members onto the DTC-Kryo-without-hash 0.501 MB
+graph. Input of these programs came from Google Web
+Graph. with 875,713 vertices and 5,105,039 edges, testing TABLE V
+on 5 cases then stop the JVM, after that re-run these 5 cases CHECKPOINT’S STORAGE USAGE OF DATASET
+again on RDD. Storage usage Size Unit Finally, we compared the Pi Estimation program by using No-checkpoint 0 MB Monte Carlo algorithm shows in (1) [29]. Spark original checkpoint 9.860 MB DTC-Avro-with-hash 0.987 MB
+DTC-Avro-without-hash 0.987 MB DTC-Parquet-with-hash 0.993 MB
+DTC-Parquet-without-hash 0.993 MB Spark-flow 9.930 MB
+\[ \mathbb{P}(\text{fall within circle}) = \frac{\text{volume of the unit circle}}{\text{volume of the square}} = \frac{\iint_{\{x^2+y^2\le 1\}} dx\,dy}{\iint_{\{-1\le x\le 1,\,-1\le y\le 1\}} dx\,dy} = \frac{\pi}{4} \qquad (1) \]
+different from original-checkpoint. The no-checkpoint configuration does not have this startup
+overhead, so it runs at 136 seconds on average. For the first
+The algorithm randomly generated two values which run, All DTC and the original-checkpoint are 4.7 represent to coordinate x and y of unit circle (so both x and times or slower than the no-checkpoint mechanism. y are between -1 to 1). After that, trying to addition However, all DTC configurations are significantly faster in between square magnitude of x and square magnitude of y the subsequence runs.
+and if that result less than or equal to 1 will be count as fall Fig. 6 shows the comparison between cases of applying in the unit circle. That number will use to represent π/4, so hash functions over input data to allow the system to detect
+
+Fig. 5. Comparison of checkpoint time of RDDs without hashing inputs using the Fig. 6. Comparison of checkpoint time of RDDs with hashing inputs using the
+Wordcount program. (10 cases consecutively) Wordcount program. (10 cases consecutively)
+
+Fig. 7. Comparison of checkpoint time of DataSet,including Spark-flow without Fig. 8. Comparison of checkpoint time of DataSet,including Spark-flow with
+hashing inputs using the Wordcount program (10 cases consecutively). hashing inputs using the Wordcount program (10 cases consecutively).
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+changes of the input. It shows that DTC mechanisms are slower than no-checkpoint and original-checkpoint only in the first run. In the subsequent runs, DTC mechanisms make the tests faster than those run by no-checkpoint and original-checkpoint. We found that DTC-Kryo-SHA1 is the slowest in the first run. It uses 908 seconds on average, while no-checkpoint uses 136 seconds and original-checkpoint uses 636 seconds.
+In the subsequent runs, the DTC mechanism uses around 85 seconds on average. It is significantly faster than both no-checkpoint and original-checkpoint, which
+is 60%
+In the first run with hash input, the fastest DTC mechanism is DTC-Java-SHA256; it is 480% slower than no-checkpoint and 24% slower than original-checkpoint. In the subsequent runs, this mechanism is 40% faster than no-checkpoint and 590% faster than original-checkpoint. Other cases
+are in similar trends.
+In case of DataSet, we found similar trends to the case of RDD. During the first run, DTC mechanisms are the slowest, and significantly faster in subsequent runs. Fig. 7 and Fig. 8 show the comparison between checkpoint mechanisms for the DataSet without hashing input and with hashing input, respectively. We also include Spark-flow
+in these experiments. We found that Spark-flow uses 752 seconds at the first run, while DTC-Parquet-MD5
+uses 606 seconds, so DTC is 24% faster than Spark-flow. In case of hash input data, DTC is 40% slower than Spark-flow for the first run. However, in the subsequent runs, DTC dramatically reduces the time spent, according to the aforementioned trends.
+The mechanism of checkpoint usually requires use of storage. The storage usage comparison is then presented in Table IV. According to the table, DTC with Java serializer uses the storage only one-tenth of those used by the original Spark checkpoint. In case of DTC with Kryo, it uses storage only 5% of the original-checkpoint.
+These storage usages are similar for DataSet. According to Table V, DTC with the Avro format uses only 10% of the original storage. In case of DTC with the Parquet format, it uses only 11% of the original storage. Comparing these results with Spark-flow, we are roughly at the same ratio.
+DTC is designed to allow re-usability of RDDs and DataSets. It can traverse and detect changes of the dependency of each RDD or DataSet. From the experiments, we have found that DTC has a larger overhead than the mechanism of the Original Spark only when the testcases are in their first run. When the testcases are in the later runs, DTC makes them 5-6 times faster than running by the Original Spark and Spark-flow. Moreover, DTC uses
+disk space 8-9 times less than both implementations as shown in Table IV and Table V.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1089
+(a) (b)
+Fig. 9. Comparison of checkpoint time of RDDs using the Wordcount program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+(a) (b)
+Fig. 10. Comparison of checkpoint time of DataSet using the Wordcount program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+D. Experimental results (5 cases with JVM termination)
+In this section, we discuss the experimental results in case of running 5 cases consecutively, then stopping the JVM, after that the experimental cases were re-run again. Its behavior on different frameworks were observed.
+Firstly, we discuss the result of the Wordcount program on RDD. We found that DTC-Java-SHA256 used 542 seconds in the first run before stopping the JVM, so DTC is 9% faster than original-checkpoint, which uses 596 seconds. After stopping the JVM or closing the program and then re-running the test cases, DTC with all settings used only a few seconds to recover the checkpoint, while other frameworks used hundreds of seconds, as shown in Fig 9. In Fig 9, the dashed line is the first run before JVM termination and the solid line is the second run after restarting the JVM.
+In the case of DataSet, shown in Fig 10, the dashed line presents the first run of 5 cases. We found that the original-checkpoint used 654 seconds, while Spark-flow used 585 seconds. So, Spark-flow is 11%
+faster than the original one. But DTC with the DTC-Parquet-MD5 configuration, it used 595 seconds, 9% faster than original-checkpoint. However, in
+the second run of 5 cases after restarting the JVM, as the solid line, the results show that the original-checkpoint used 697 seconds and Spark-flow used 545 seconds, while DTC with any configuration used just few seconds.
+Fig. 11 shows the results comparing frameworks using the Triangle Counting program. In the case of not applying hashing to the input data, as shown in Fig 11 (a), no-checkpoint, original-checkpoint and
+DTC used almost the same amount of time for the first runs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+(a) (b)
+Fig. 11. Comparison of checkpoint time of RDDs using the Triangle Counting program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+(a) (b)
+Fig. 12. Comparison of checkpoint time of RDDs using PageRank Program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1090
+For the second runs after restarting the JVM, we found the same trend as we were discussing earlier. DTC with all configurations could reduce the time for testing to just a few seconds. Because the inputs were in the form of a graph (vertices and edges), as shown in Fig 11 (b), the underlying mechanism of the Spark Framework tries to perform operations efficiently by casting the partition of the input to the class ShippableVertexPartition. In the research work reported in this paper, DTC does not yet support reading this kind of data type. Fig 11 (b) shows that DTC with all configurations could not help reduce time much. All frameworks use the same amount of time processing the data.
+In Fig 12 shows the experimental results obtained from running the PageRank program. PageRank is a program that
+processes graphs. It used the same set of inputs as the previous experimental, Triangle Counting. In Fig 12 (a), it shows the results in the case of not applying hashing to the input data. We found that in the first testcase of the first run, the results of DTC with Java serialization, with either MD5 or SHA1 as the hash function, used 204 seconds, while the original-checkpoint used 214 seconds. In
+this comparison, DTC could speed up by 4%. For the rest of testcases, times spent by DTC is cut down to just a few seconds. In Fig 12 (b), we also found the same problem as of the Triangle Counting program. This was the result of hashing input.
+Finally, we discuss the results of the Pi Estimation program. In Fig. 13, we showed tenor of comparing frameworks. For the first testcase of the first run, we found
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+
+(a) (b)
+Fig. 13. Comparison of checkpoint time of RDDs using Pi Estimation Program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1091
+that without hashing inputs, the DTC-Kryo-SHA256 used 114 seconds, while the original-checkpoint used
+135 seconds as shown in Fig 13 (a) DTC was 18% faster in this case. In the consequent testcases, DTC could cut the running time significantly.
+In case of hashing inputs, we found the same trend, as shown in Fig 13 (b), as in the previous results. DTC used processing time almost the same as original-checkpoint at the first testcase and then dramatically sped up, using only a few seconds for testing each testcase. Moreover, the DTC framework can detect changes in case of random values, so that Spark developers can reproduce the input which causes software issues.
+V. CONCLUSIONS AND FUTURE WORK
+The experimental results have clearly shown that DTC is suitable for improving the productivity of unit testing in Big Data applications in terms of time consumption and storage usage. We can perform testing for Big Data either on a local machine or on a cluster. DTC could trace changes in testcases with random values. Unfortunately, we found that DTC could not work well in case of graph algorithms such as Triangle Counting or PageRank, because the Spark framework casts partitions of an input to ShippableVertexPartition. Thus, one limitation of DTC is the input datatype. We are researching potential mechanisms which can be used for increasing the speed of testing and reducing storage usage, such as cache and persist. The JVM configurations are among the tuning parameters we are focusing on. These subjects are being studied.
+REFERENCES
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright © 2018 GiRI (Global IT Research Institute)
diff --git a/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt b/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt
new file mode 100644
index 0000000..b5717a0
--- /dev/null
+++ b/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt
@@ -0,0 +1,174 @@
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/
+Quality Control Framework of Big Data for Early Warning of
+Agricultural Meteorological Disasters
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+Jiale Li
+College of Ecology and Environment, Institute of Disaster Prevention
+Sanhe, Hebei, China
+lijiale_cumtb@126.com
+ABSTRACT
+Agricultural meteorological disasters, including floods, droughts, dry hot winds, low temperature chills, typhoons, hail and continuous rain, can lead to significant reduction in agricultural output. Big data platform for early warning of agricultural meteorological disaster is the basis of business operation system for early warning of agricultural meteorological disasters, and the data quality is an important guarantee for success of the early warning. Quality control of big data for early warning of agricultural meteorological disaster involves names of data sets, metadata, data documents and content of data sets. The quality control for contents of data sets is divided into quality control of attribute data and that of spatial data, and quality control of spatial data is divided into quality control of vector data and that of raster data. Methods for data quality control are divided into fully automatic, semi-automatic and full manual control methods.
+CCS CONCEPTS
+• Social and professional topics ~ Quality assurance • Hardware ~ Printed circuit boards • Computing methodologies ~ Machine learning
+KEYWORDS
+agro-meteorological disasters, early warning, big data, quality control, framework.
+1 Introduction
+Meteorological disasters are atmospheric natural disasters that cause harm to human life and property, cause losses to social and economic development, and have serious adverse effects on human production and life [1]. According to statistics from the United Nations World Meteorological Organization, meteorological disasters account for 60% of all natural disasters [2]. China is a country with frequent natural disasters, and food
+Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for components of this work owned by others than ACM must be honored. Abstracting with credit is permitted. To copy otherwise, or republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from Permissions@acm.org.
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China © 2019 Association for Computing Machinery. ACM ISBN 978-1-4503-7150-6/19/07…$15.00 https://doi.org/10.1145/3349341.3349371
+Shunbao Liao†
+College of Ecology and Environment, Institute of Disaster Prevention
+Sanhe, Hebei, China
+liaoshunbao@cidp.edu.cn
+production is greatly affected by natural disasters. About 70% of natural disasters are resulted from meteorological disasters [3].
+Agro-meteorological disasters are a general term for adverse weather or climatic conditions that occur in agricultural production processes and result in significant reduction in agricultural production, including floods, droughts, dry hot winds, low temperature chills, typhoons, hail and continuous rain [4]. Agro-meteorological disaster prevention needs to know a lot of information such as weather forecast, weather conditions, the scope of meteorological disasters, duration, intensity of disasters, population distribution of affected areas, number of large livestock, crop planting area, water irrigation status, etc. This information includes both spatial geographic information and a large number of weather attribute information inseparable from space [5]. Therefore, it is an effective method to combine high-tech such as remote sensing and GIS and conventional disaster monitoring and evaluation methods to monitor and evaluate major agrometeorological disasters [6]. Real-time quality control of meteorological data is of great significance for meteorological support of aviation activities and disaster prevention and mitigation [7].
+Data Quality Management is to improve data quality by refining and enhancing the management level of the organization. The management of data consists of a series of activities, which involve identification, measurement, monitoring, and early warning of data quality problems. These problems could be triggered off in one of the phases, which range from data planning, collection, storage, sharing, maintenance, and application to data destruction. Data quality assessment and management are generally measured in several dimensions, including completeness, conformity, consistency, accuracy, uniqueness, and integration [8].
+2 Big Data Platform for Early Warning of
+Agricultural Meteorological Disasters
+2.1 Platform Structure
+Big data platform for early warning of agricultural meteorological disasters and model system are the basis of early warning service operation system (as shown in Figure 1). Users call data from Big data platform and early warning models through the interface of early warning service system for agricultural meteorological disasters to realize the early warning of agricultural meteorological disasters. At the same time, the
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+business system stores the user's early warning results into Big data platform for other users to query.
+User1 User2 …… User n
+Operation system for agricultural meteorological disasters warning service (Interface)
+Big data platform for Models system for agricultural meteorological
+disasters warning disasters warning
+Basic data for agricultural
+meteorological disasters Model build/selection
+warning
+Figure 1: Operation business system for early warning service of agricultural meteorological disasters
+The quality control of big data for early warning of agrometeorological disasters refers to data quality inspection and data correction that arise in the process from basic data to Big data platform for agrometeorological disasters warning. However, the data quality issues that occur in the process from user operation results to Big data platform for agrometeorological disasters warning will not be discussed in this paper.
+2.2 Quality Control Objects
+Big data are divided into structured data and unstructured data, and the quality control of early warning big data for agricultural meteorological disasters is mainly for structured data. The large database of agricultural meteorological disaster warning consists of attribute database and spatial database. The attribute database includes real-time observation database (such as meteorological observation database) and non-real-time observation database (such as statistical survey database, historical climate database, etc.). The spatial database includes spatial vector database and spatial raster database. It was stipulated in this study that the object of quality control for big data of agricultural meteorological disasters warning was a data set, which was, a two-dimensional table in relational database, coverage in vector database or a grid layer in raster database.
+Quality control objects in Big data platform for early warning of agricultural meteorological disasters are listed in Table 1.
+Table 1. Quality control objects in the big data platform
+
+Data types at level 1
Data types at level 2
Quality control objects
Examples
Attribute data
Real-time observed data
Tables in relational database
real-time observed meteorological data
Non-real- time observed data
Tables in relational database
statistical survey data, historical climate data
Spatial data
Vector data
Vector layers
Land use, boundary
Raster data
Raster layers
DEM, NDVI
3 Contents of Quality Control
+According to data management strategy and actual situation of data, quality control of big data for agricultural meteorological disaster early warning was carried out at different levels, including quality control of data set names, metadata, data documents, and content of data sets. The quality control of content of data sets was divided into quality control of attribute data and that of spatial data, and quality control of spatial data was divided into quality control of vector data and that of raster data.
+3.1 Quality Control of Data Set Names
+Big data for agro-meteorological disaster warning are spatiotemporal data. The purpose of normalization of data set name is to let users know the spatiotemporal range, detail level and thematic content of data set by names of data sets, that is, the basic information about a dataset can be obtained by its name.
+Therefore, dataset names of big data for agrometeorological disaster warning should contain four elements, which are spatial scope (region), time range, detailed level and thematic content of data sets, but however the order of these elements can be adjusted according to the habit. The time range refers to the time of data acquisition, not the time when the data is published or released. The detail level of data may be scale of vector data, spatial resolution of raster data, or administrative division unit of statistical survey data. For the normalization of data set name, the example is as follows:
+Example: National 1:100,000 land use data (2015). Where "national" is the spatial range of data; "1:100,000" refers to the detail level of data; "Land use" is the thematic content; "2015" represents the time of the data.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+75
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+3.2 Metadata and Data Documents
+Metadata is data about data. It is information that describes a dataset. Metadata generally describes data sets by standardized entries, which are normative and uniform. Metadata can help users understand and apply data sets. Without metadata, users sometimes cannot fully interpret data. Therefore, metadata
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+conforming to norms and with sufficient information is an important means of data quality assurance.
+A data document is a file that describes a data set. Compared with metadata, data documents do not follow a strict coding specification, but they are sometimes critical to the user's understanding of data. For example, in some data sets, attribute elements are represented by codes consisting of letters and numbers, the description of the codes (including meaning, unit, etc.) is particularly important. Both metadata and data document are important means of data quality control, but they have their own characteristics. Metadata is more standardized, but the description of datasets by metadata is sometimes not specific. Data documents are not as standardized as metadata, but their description may be more specific. Therefore, metadata is relatively suitable for the standardized management of data sets, and data documents are more suitable for the interpretation and application of data sets by users. From the perspective of data quality control, either metadata or data documents should accompany data sets. It's best to have both.
+3.3 Quality Control of Contents of Data Sets
+Quality control of data set content is divided into quality control of attribute data and that of spatial data, and quality control of spatial data is divided into quality control of vector data and that of raster data.
+3.3.1 Quality Control of Attribute Data. Attribute data is also
+called two-dimensional tabular data, which is a table in a relational database. The attribute data in the agrometeorological disaster warning database mainly includes real-time and historical meteorological data, and statistical survey data.
+3.3.2 Quality Control of Real-Time and Historical Meteorological
+Data. For those kinds of data, meteorological stations are generally used as recording units, and the main contents of quality control are as follows:
+(a) Quality control of weather station codes: It is mainly checked whether the codes of weather stations are within the national standard codes database and whether the corresponding relationship between the codes and the names of weather stations is correct.
+(b) Quality control of spatial coordinates of weather stations: it is checked whether the longitude, latitude and altitude of weather stations are correct.
+(c) Quality control of time elements: it is checked whether the attribute value and the format of time for each record is correct.
+(d) Missing value check: checked contents include missing values for the fields that should have values, the percentage of missing values, and whether the missing values can be interpolated by some means, and so on.
+(e) Outlier check: according to the spatial-temporal variation law of meteorological data, check whether there is outlier in data sets by certain mathematical methods, whether to eliminate or correct them.
+(f) Logical rationality check: According to meteorological knowledge, check whether there exist the data inconformity to conventional logic. For example, whether the lowest value is
+greater than the highest value, or whether the average value is between the maximum value and the minimum value, and so on.
+(g) Checking of other obvious errors.
+3.3.2.1 Quality Control of Statistical Survey Data. Statistical survey data are generally recorded by administrative divisions, and the main contents of data quality control include:
+(a) Quality control of administrative divisions’ codes: check whether the administrative divisions’ codes are within the scope of the national standard, and whether the correspondence between the administrative divisions’ codes and their name is correct.
+(b) Quality control of time elements: check whether the attribute value and the format of time element for each record are correct.
+(c) Missing value check: which fields should have values but are actually missing, the percentage of missing values, whether they can be interpolated by some means, and so on.
+(d) Logical rationality check: according to the basic knowledge of statistics, check whether there exist the data inconformity to conventional logic. For example, in some administrative divisions, whether the total output of a certain crop is greater than the total grain output, whether the total crop output is equal to the planting area multiplied by the yield of a unit area, and whether the sum of the total grain output of the lower administrative divisions is equal to the total grain output of the higher administrative division, and so on.
+(e) Checking of other obvious errors.
+3.3.3 Quality Control of Spatial Data. Due to the instability of
+spatial entities, the limitations of human cognitive expression, the observation errors of spatial entities, and the errors in spatial data processing, spatial data can cause quality problems when expressing the real world. According to its sources, the error of geographic information spatial data can be divided into the original data error and the error introduced by the spatial database construction.
+3.3.3.1 Coordinate and Map Projection Checking. Spatial data
+includes vector data and raster data. Whether it is vector data or raster data, it first needs to be checked whether its coordinate system including ellipsoid parameters and map projection parameters are consistent with the corresponding parameters defined in the database. If not, conversion and modification are required to ensure overlay and spatial analysis between spatial data to be carried out.
+3.3.3.2 Quality Control of Vector Elements. According to scale
+and thematic content of data sets, it should be checked whether vector features (lines and polygons) conform to corresponding mapping specifications, for example normalization of lines and minimum spot on maps. The reference specification for the quality control is mapping specification at corresponding scale.
+3.3.3.3 Quality Control of Raster Features. It should be checked
+whether the size of grid cells is the same as that indicated in the
+dataset name.
+3.3.3.4 Quality Control of Attribute Elements in Spatial Data
+Sets. For vector layer, the following contents should be checked:
+(a) Code correctness checking: it should be checked whether attribute codes of vector elements (such as administrative divisions’ codes, land use type code, etc.) are beyond codes base,
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+76
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+and whether the correspondence between codes and type names (such as administrative divisions’ names, names of land use type, etc.) is correct or not.
+(b) Name/code missing checking: it should be checked whether there exist unnamed or uncoded vector features (points, lines or polygons).
+(c) Checking of other attribute element values: it should be checked whether attribute values of vector features (such as temperature value in the isotherm) exceeds extreme limits.
+(d) Obvious errors checking: it should be checked whether there are obvious errors in data sets by GIS software and visualization means.
+For raster layers, the following contents should be checked:
+(a) Code correctness checking: it should be checked whether attribute codes of grid cells are within code database.
+(b) Logical rationality checking: for example, whether NDVI values are between 0 and 1.
+(c) Missing value checking: it should be checked whether there exist grid cells without attribute values, the ratio of the grid cells without attribute values to all cells, and whether the missing values can be interpolated by some methods.
+(d) Outlier checking: such as cliff detection in DEM.
+(e) Extreme values checking: it should be checked whether the attribute values of grid cells (such as temperature) exceeds the extreme limits.
+(f) Obvious error checking: it can be visually checked whether there are obvious errors in raster layers by image processing system or GIS software.
+4 Methods of Quality Control
+Quality control methods of big data for early warning of agricultural meteorological disasters are divided into three types: automatic control methods, artificially interactive semi-automatic control methods and full manual control methods.
+relatively low update frequency and low timeliness requirements. For example, detection of coordinate systems and projection parameters of spatial data, cartographic normative detection of vector features in digital maps, identification of grid cell size in raster data, detection of code normalization and logic consistency of attribute data in statistical survey data, etc.
+4.3 Full Manual Control Methods
+The data quality problems are detected and analyzed completely by manual visual method. Some obvious data quality problems may not be discovered through automated or semi-automated methods, but experienced technicians can easily identify them through manual visual methods, for example, obviously nonstandard drawings in digital maps or illogical values of grid cells. Checking of name normalization of data sets is also usually done by manual inspection methods.
+5 Technological Process of Data Quality Control
+Based on the above analysis, we can draw a flow chart for data quality control of Big data platform for agricultural meteorological disaster warning, as shown in Figure 2.
+The data quality control process of Big data platform for agricultural meteorological disaster warning mainly includes:(1) data set name inspection, (2) data set content inspection. Quality control of data set content includes attribute data and spatial data. Attribute data are mainly used for meteorological observation data and statistical survey data. Spatial data are divided into vector data and raster data. Its quality control mainly checks the coordinate system and projection parameters, as well as the quality inspection of various spatial elements.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+77
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+4.1 Automatic Data Quality Control Methods
+Instead of man-machine interaction, automatic data quality control methods realize data quality detection through computer software. The automatic methods are mainly aimed at real-time collected data with obvious characteristics of time series, such as real-time and quasi-real-time meteorological observation data. The quality inspection for real-time collected data needs not only high timeliness but also completing heavy workload. Only automated quality inspection can meet the needs of data quality control.
+Quality problems of historical meteorological observation data, and some quantitative quality problems in vector data and raster data, can also be detected by automatic methods.
+4.2 Semi-Automatic Quality Control Methods
+With participation of professional technicians, the quality of data sets is interactively checked and judged through statistical analysis software or RS/GIS software. This situation is mainly for vector data, raster data, statistical survey data, etc., which have
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al.
+Big data for
+agricultural meteorological disasters warning
+Datasets: 2D attribute table / Vector data layer /Raster data layer Names of data sets Contents of data sets
+Normalization check for Quality control for contents of data sets names of data sets
+Attribute data Spatial data
+Whether Meteorological Vector Raster it observation Statistical data data
+N contains 4 data survey data layer layer major
+elements
+Y Coordinate system and map
+projection check
+Normative Grid cell
+detection of size vector features detection
+Code correctness Station code Code correctness
+Logical rationality Station coordinates Admin. codes Missing codes
+Missing values Time elements Time elements Abnorm. inspection
+Abnormal inspection Missing values Missing values Obvious errors
+Extreme check Outliers Logical rationality detection
+Obvious error Logical rationality …… ……
+detection
+……
+…… Semi-automatic Semi-automatic /
+Semi-automatic / Automatic detection detection manual detection
+manual
+Is there a Y
+quality
+problem
+N
+End
+Figure 2: Flow chart of data quality control for big data platform of agricultural meteorological disaster warning
+6 Conclusions and Discussions
+6.1 Conclusions
+The framework, objects, contents and methods of data quality control for Big data platform of agricultural meteorological disasters warning were analyzed systematically in this study. The following conclusions were drawn:
+(a) Data quality control is a basic work for construction of Big data platform of agricultural meteorological disasters warning, and it is also an important guarantee for success of early warning. In addition to the quality control of contents of data sets themselves, dataset names, metadata and data documents are also integral parts of data quality control for Big data platform of agricultural meteorological disaster warning.
+This document was truncated here because it was created in the Evaluation Mode.
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+78
From 7109f0f5a8e48aa8cdf7fdc0dc339e8fa80d2c8c Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 5 Mar 2026 09:54:55 -0300
Subject: [PATCH 14/17] update docs to import to rag
---
scripts/build_rag_index.bat | 38 +
scripts/build_rag_index.py | 115 +
scripts/dev/start.bat | 16 +
src/rag/simple_rag.py | 21 +-
storage/vectorstore/documents.json | 11902 +++++++++++++++++++++------
5 files changed, 9358 insertions(+), 2734 deletions(-)
create mode 100644 scripts/build_rag_index.bat
create mode 100644 scripts/build_rag_index.py
diff --git a/scripts/build_rag_index.bat b/scripts/build_rag_index.bat
new file mode 100644
index 0000000..11c1e92
--- /dev/null
+++ b/scripts/build_rag_index.bat
@@ -0,0 +1,38 @@
+@echo off
+REM Build (or rebuild) the RAG vector store from docs_to_import/
+REM
+REM Usage:
+REM scripts\build_rag_index.bat -- build if not yet built
+REM scripts\build_rag_index.bat --rebuild -- wipe and rebuild from scratch
+chcp 65001 > nul
+
+:: Resolve project root (parent of scripts/)
+set SCRIPT_DIR=%~dp0
+pushd "%SCRIPT_DIR%.."
+set PROJECT_DIR=%CD%
+popd
+
+echo ============================================================
+echo RAG Index Builder
+echo ============================================================
+echo Project : %PROJECT_DIR%
+echo Source : %PROJECT_DIR%\docs_to_import
+echo Output : %PROJECT_DIR%\storage\vectorstore\documents.json
+echo ============================================================
+
+:: Activate virtual environment
+if exist "%PROJECT_DIR%\.venv\Scripts\activate.bat" (
+ call "%PROJECT_DIR%\.venv\Scripts\activate.bat"
+) else (
+ echo [WARNING] .venv not found — using system Python
+)
+
+:: Run the Python build script, forwarding any arguments (e.g. --rebuild)
+"%PROJECT_DIR%\.venv\Scripts\python.exe" "%PROJECT_DIR%\scripts\build_rag_index.py" %*
+if errorlevel 1 (
+ echo [ERROR] RAG index build failed.
+ exit /b 1
+)
+
+echo.
+echo [OK] RAG index ready.
diff --git a/scripts/build_rag_index.py b/scripts/build_rag_index.py
new file mode 100644
index 0000000..db04ed2
--- /dev/null
+++ b/scripts/build_rag_index.py
@@ -0,0 +1,115 @@
+"""Build (or rebuild) the RAG vector store from docs_to_import/.
+
+Usage:
+ python scripts/build_rag_index.py # build if not yet built
+ python scripts/build_rag_index.py --rebuild # wipe and rebuild from scratch
+
+The output is saved to storage/vectorstore/documents.json.
+On subsequent application starts the index is loaded directly from that file,
+so no document processing occurs at startup.
+"""
+
+import argparse
+import sys
+import time
+from pathlib import Path
+
+# ── Resolve project root and add src/ to path ────────────────────────────────
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+SRC_DIR = PROJECT_ROOT / "src"
+DOCS_DIR = PROJECT_ROOT / "docs_to_import"
+STORE_FILE = PROJECT_ROOT / "storage" / "vectorstore" / "documents.json"
+
+sys.path.insert(0, str(SRC_DIR))
+
+# ── Supported extensions (must match simple_rag.py) ──────────────────────────
+SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".csv"}
+
+
+def _count_files(folder: Path) -> int:
+ return sum(
+ 1 for f in folder.rglob("*") if f.is_file() and f.suffix.lower() in SUPPORTED_EXTENSIONS
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Build RAG index from docs_to_import/")
+ parser.add_argument(
+ "--rebuild",
+ action="store_true",
+ help="Delete existing index and rebuild from scratch",
+ )
+ args = parser.parse_args()
+
+ # ── Validate docs directory ───────────────────────────────────────────────
+ if not DOCS_DIR.exists():
+ print(f"[ERROR] docs_to_import directory not found: {DOCS_DIR}")
+ sys.exit(1)
+
+ file_count = _count_files(DOCS_DIR)
+ print("=" * 60)
+ print("RAG Index Builder")
+ print("=" * 60)
+ print(f" Source : {DOCS_DIR}")
+ print(f" Output : {STORE_FILE}")
+ print(f" Files : {file_count} supported files found")
+
+ # ── Skip if already built (unless --rebuild) ──────────────────────────────
+ if STORE_FILE.exists() and not args.rebuild:
+ import json
+
+ try:
+ with open(STORE_FILE, "r", encoding="utf-8") as fh:
+ data = json.load(fh)
+ existing = len(data.get("documents", {}))
+ except Exception:
+ existing = 0
+
+ print(f"\n[OK] Index already exists ({existing} documents).")
+ print(" Use --rebuild to wipe and re-process everything.")
+ print("=" * 60)
+ return
+
+ # ── Delete stale index when rebuilding ───────────────────────────────────
+ if args.rebuild and STORE_FILE.exists():
+ STORE_FILE.unlink()
+ print("\n[INFO] Existing index deleted — rebuilding from scratch...")
+
+ # ── Load RAG config and initialise SimpleRAG ─────────────────────────────
+ # Temporarily change cwd to src/ so relative paths inside SimpleRAG resolve
+ import os
+
+ original_cwd = os.getcwd()
+ os.chdir(SRC_DIR)
+
+ try:
+ from rag.config_simple import RAGConfig # type: ignore[import]
+ from rag.simple_rag import SimpleRAG # type: ignore[import]
+
+ config = RAGConfig.from_env()
+ # Override storage path to always use absolute project path
+ config.storage_path = STORE_FILE.parent
+
+ print("\n[INFO] Initialising RAG engine and importing documents…")
+ t0 = time.time()
+
+ # SimpleRAG.__init__ → _load_documents → _auto_import_or_fallback
+ # Since STORE_FILE was deleted (or never existed), it will auto-import.
+ rag = SimpleRAG(config)
+
+ elapsed = time.time() - t0
+ total_docs = len(rag.documents)
+ total_chunks = sum(len(c) for c in rag.document_chunks.values())
+
+ print(f"\n[OK] Index built successfully in {elapsed:.1f}s")
+ print(f" Documents : {total_docs}")
+ print(f" Chunks : {total_chunks}")
+ print(f" Saved to : {STORE_FILE}")
+ print("=" * 60)
+
+ finally:
+ os.chdir(original_cwd)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/dev/start.bat b/scripts/dev/start.bat
index 8b1c13d..379a1ae 100644
--- a/scripts/dev/start.bat
+++ b/scripts/dev/start.bat
@@ -22,6 +22,22 @@ if not exist "%PROJECT_DIR%.venv\Scripts\activate.bat" (
:: Ativar ambiente virtual Python
call "%PROJECT_DIR%.venv\Scripts\activate.bat"
+:: ── RAG Index: build once, reuse forever ─────────────────────────────────────
+set RAG_INDEX=%PROJECT_DIR%storage\vectorstore\documents.json
+if not exist "%RAG_INDEX%" (
+ echo [RAG] Índice RAG não encontrado. Construindo a partir de docs_to_import/...
+ echo [RAG] Isso ocorre apenas na primeira execucao ou apos --rebuild.
+ call "%PROJECT_DIR%scripts\build_rag_index.bat"
+ if errorlevel 1 (
+ echo [AVISO] Falha ao construir índice RAG. O backend usará fallback.
+ ) else (
+ echo [RAG] Índice pronto.
+ )
+) else (
+ echo [RAG] Índice RAG já existe — carregando do cache.
+)
+echo.
+
:: Iniciar backend em uma nova janela
echo [1/3] Iniciando backend...
start cmd /k "title Backend && cd %PROJECT_DIR%src && %PROJECT_DIR%.venv\Scripts\python.exe api.py"
diff --git a/src/rag/simple_rag.py b/src/rag/simple_rag.py
index d9af521..3cc292f 100644
--- a/src/rag/simple_rag.py
+++ b/src/rag/simple_rag.py
@@ -12,7 +12,7 @@
from typing import Dict, List, Optional
# Supported file types for auto-import
-_AUTO_IMPORT_EXTENSIONS = {".txt", ".md", ".pdf"}
+_AUTO_IMPORT_EXTENSIONS = {".txt", ".md", ".pdf", ".csv"}
class SimpleRAG:
@@ -140,6 +140,25 @@ def _extract_text_from_file(self, file_path: Path) -> Optional[str]:
print("[WARNING] PyPDF2 not installed; PDF files cannot be imported.")
except Exception as e: # pylint: disable=broad-exception-caught
print(f"[WARNING] Could not read PDF {file_path.name}: {e}")
+ elif suffix == ".csv":
+ try:
+ import pandas as pd # pylint: disable=import-outside-toplevel
+
+ for encoding in ("utf-8", "latin-1", "cp1252"):
+ try:
+ df = pd.read_csv(file_path, encoding=encoding)
+ # Convert DataFrame to readable text: header + rows
+ lines = [" | ".join(str(c) for c in df.columns)]
+ for _, row in df.iterrows():
+ lines.append(" | ".join(str(v) for v in row.values))
+ return "\n".join(lines)
+ except UnicodeDecodeError:
+ continue
+ return None
+ except ImportError:
+ print("[WARNING] pandas not installed; CSV files cannot be imported.")
+ except Exception as e: # pylint: disable=broad-exception-caught
+ print(f"[WARNING] Could not read CSV {file_path.name}: {e}")
return None
def _auto_import_from_folder(self, folder: Path):
diff --git a/storage/vectorstore/documents.json b/storage/vectorstore/documents.json
index 4632058..1967a2e 100644
--- a/storage/vectorstore/documents.json
+++ b/storage/vectorstore/documents.json
@@ -1,587 +1,698 @@
{
"documents": {
- "328a07f5-149e-46ee-b0de-0163aedfcde3": {
- "id": "328a07f5-149e-46ee-b0de-0163aedfcde3",
- "content": "Apache Spark Best Practices for Data Quality\n\nIntroduction:\nApache Spark is a powerful distributed computing framework that excels at processing large datasets. When implementing data quality checks, following best practices ensures optimal performance and reliable results.\n\nKey Principles:\n\n1. Partition Strategy\n - Use appropriate partitioning to avoid skewed data\n - Consider partition size (aim for 128MB-1GB per partition)\n - Use repartition() vs coalesce() appropriately\n\n2. Caching Strategy\n - Cache DataFrames that are accessed multiple times\n - Use appropriate storage levels (MEMORY_AND_DISK_SER)\n - Unpersist when no longer needed\n\n3. Data Quality Patterns\n - Implement schema validation early in the pipeline\n - Use built-in Spark functions for better performance\n - Avoid collect() on large datasets\n\n4. Error Handling\n - Implement graceful degradation for data quality issues\n - Log quality metrics for monitoring\n - Use checkpoints for long-running processes\n\n5. Resource Management\n - Configure executor memory and cores appropriately\n - Monitor garbage collection patterns\n - Use dynamic allocation when possible\n\nExample Code Patterns:\n\ndef validate_data_quality(df):\n # Count nulls efficiently\n null_counts = df.select([\n sum(col(c).isNull().cast(\"int\")).alias(f\"{c}_nulls\")\n for c in df.columns\n ]).collect()[0]\n \n # Check for duplicates\n total_rows = df.count()\n distinct_rows = df.distinct().count()\n duplicate_rate = (total_rows - distinct_rows) / total_rows\n \n return {\n 'null_counts': null_counts.asDict(),\n 'duplicate_rate': duplicate_rate,\n 'total_rows': total_rows\n }\n\nPerformance Tips:\n- Use broadcast joins for small lookup tables\n- Prefer DataFrames over RDDs for better optimization\n- Use columnar formats like Parquet for better I/O\n- Enable adaptive query execution (AQE) in Spark 3.0+",
- "metadata": {
- "filename": "spark_best_practices.txt",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\spark_best_practices.txt",
- "file_size": 1974,
- "file_type": ".txt",
- "imported_at": "2025-12-17T21:23:23.679465",
- "content_length": 1918
- }
- },
- "bcd1c4e9-600f-4286-8599-481901d79c71": {
- "id": "bcd1c4e9-600f-4286-8599-481901d79c71",
+ "e9b99d1a-4e76-4a90-86ce-dd08c0bdb107": {
"content": "# Estratégias de Validação de Dados\n\n## Validação em Tempo Real vs Batch\n\n### Validação Batch\n- Processa grandes volumes de dados históricos\n- Permite análises complexas e estatísticas\n- Ideal para relatórios e auditorias\n- Menos recursos em tempo real\n\n### Validação em Tempo Real\n- Verifica dados conforme chegam\n- Permite correção imediata\n- Requer mais recursos de infraestrutura\n- Crítica para sistemas transacionais\n\n## Implementação com Kafka + Spark Streaming\n\n```python\nfrom pyspark.streaming import StreamingContext\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\n\ndef process_streaming_data(df, epoch_id):\n # Validações em tempo real\n \n # 1. Schema validation\n expected_schema = [\"id\", \"timestamp\", \"value\", \"status\"]\n if set(df.columns) != set(expected_schema):\n raise ValueError(\"Schema mismatch detected\")\n \n # 2. Business rules validation\n invalid_records = df.filter(\n (col(\"value\") < 0) | \n (col(\"value\") > 1000) |\n (col(\"status\").isNull())\n )\n \n invalid_count = invalid_records.count()\n if invalid_count > 0:\n print(f\"Warning: {invalid_count} invalid records found\")\n # Log para sistema de monitoramento\n \n # 3. Store valid records\n valid_records = df.subtract(invalid_records)\n valid_records.write.mode(\"append\").saveAsTable(\"clean_data\")\n\n# Setup streaming\nspark = SparkSession.builder.appName(\"DataQualityStream\").getOrCreate()\ndf = spark.readStream.format(\"kafka\") \\\n .option(\"kafka.bootstrap.servers\", \"localhost:9092\") \\\n .option(\"subscribe\", \"data-topic\") \\\n .load()\n\nquery = df.writeStream \\\n .foreachBatch(process_streaming_data) \\\n .start()\n\nquery.awaitTermination()\n```\n\n## Padrões de Quarentena\n\n### Isolamento de Dados Problemáticos\n1. **Quarentena Automática**: Dados que falham validação básica\n2. **Quarentena Manual**: Dados suspeitos para revisão humana\n3. 
**Quarentena Temporária**: Dados aguardando informações adicionais\n\n### Estrutura de Quarentena\n```\nquarantine/\n├── schema_errors/\n├── business_rule_violations/\n├── data_quality_issues/\n└── pending_review/\n```\n\n## Métricas de Qualidade\n\n- **Completude**: % de campos preenchidos\n- **Validade**: % de dados em formato correto\n- **Consistência**: % de dados consistentes entre fontes\n- **Precisão**: % de dados corretos\n- **Atualidade**: % de dados recentes",
"metadata": {
"filename": "data_validation_strategies.md",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\data_validation_strategies.md",
- "file_size": 2518,
- "file_type": ".md",
- "imported_at": "2025-12-17T21:23:23.687972",
- "content_length": 2378,
- "type": "markdown",
- "title": "Estratégias de Validação de Dados"
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\data_validation_strategies.md",
+ "size": 2518,
+ "source": "docs_to_import"
+ },
+ "id": "e9b99d1a-4e76-4a90-86ce-dd08c0bdb107"
+ },
+ "cc69ebd7-3f8a-4062-a785-e2a5f9dae6c7": {
+ "content": "Link\nhttps://dev.to/dataform/testing-data-quality-with-sql-assertions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb19
79/scichart-is-the-fastest-js-chart-library-available-3o3c\nhttps://dev.to/m1pko/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/ho
w-to-simplify-large-salesforce-data-migration-52km\nhttps://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your
-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5
a0p\nhttps://dev.to/dataform/testing-data-quality-with-sql-assertions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb1979/scichart-is-
the-fastest-js-chart-library-available-3o3c\nhttps://dev.to/m1pko/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/how-to-simplify-large-sales
force-data-migration-52km\nhttps://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7\nhttps:
//dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p\nhttps://dev.to/dataf
orm/testing-data-quality-with-sql-assertions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb1979/scichart-is-the-fastest-js-chart-libr
ary-available-3o3c\nhttps://dev.to/m1pko/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km
\nhttps://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-
wordpress-plugin-development-with-docker-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p\nhttps://dev.to/dataform/testing-data-quality-
with-sql-assertions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c\nhttps
://dev.to/m1pko/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km\nhttps://dev.t
o/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-wordpress-plugi
n-development-with-docker-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p\nhttps://dev.to/dataform/testing-data-quality-with-sql-assert
ions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c\nhttps://dev.to/m1pko
/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km\nhttps://dev.to/_patrickgod/fetching-mi
llions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-wordpress-plugin-development-with-docker
-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p\nhttps://dev.to/dataform/testing-data-quality-with-sql-assertions-248g\nhttps://dev.to
/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c\nhttps://dev.to/m1pko/data-quality-technical-d
ebt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km\nhttps://dev.to/_patrickgod/fetching-millions-of-rows-with-strea
ms-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil\nhttps://dev
.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stacko
verflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64961961/shared-array-for-big-data\nhttps://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can
-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop\nhttps://stackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/631
90729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttps://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-
can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhttps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r\nhttps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questio
ns/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agregration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big
-data-datasets\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64961961/shared-array-for-big-data\nhttps://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-
and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop\nhttps://stackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow
.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttps://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/qu
estions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhttps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r\nhttps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://
stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agregration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/
xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64
961961/shared-array-for-big-data\nhttps://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-
field-with-a-comma-using-sqoop\nhttps://stackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttps://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps
://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhttps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-
know-how-i-can-work-with-big-data-in-r\nhttps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agregration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630
/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/6
3695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64961961/shared-array-for-big-data\nhttps://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/6339017
0/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop\nhttps://stackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttp
s://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhttps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/611
99694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r\nhttps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agregration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppelin
g-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.c
om/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64961961/shared-array-for-big-data\nhttps://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/
63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop\nhttps://stackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-pr
eform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttps://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhtt
ps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r\nhttps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agreg
ration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-v
s-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questio
ns/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/question
s/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://sta
ckoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/
65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big
-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-fires
tore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questi
ons/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-f
or-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165
/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per
-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/quest
ions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studi
o-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://stackoverflow.com/qu
estions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-
big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/reac
t-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-fram
e-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846
053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-withi
n-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-b
est-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://sta
ckoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysq
l-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.me
dium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pi
peline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-t
o-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscov
ery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/@loginrad
ius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-dat
a-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testin
g-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.
com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.c
om/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality
-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.c
om/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/
oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.c
om/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-
my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-cu
stom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps:/
/medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-qualit
y-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-i
botta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-wit
h-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-dat
a-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.c
om/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b
81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/crea
te-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tab
les-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-da
tabricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.medium.co
m/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://mediu
m.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://mediu
m.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://m
edium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-s
tack-part-2-2c4240bdb973\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data\nhttps://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/7
7345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackover
flow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-
project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data\nhttps://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-fa
iled-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackoverflow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stac
koverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data\nhttps://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-pyt
hon\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackoverflow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with
-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data\nhttps://stackoverflow.co
m/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches
\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackoverflow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-v
b-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data\nhttps://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packe
t-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackoverflow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-paramet
er\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow
.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-
to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-
of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps
://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps
://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big
-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/715751
20/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-ch
unks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975
/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/7
2959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-col
umn-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70
568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-anot
her-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/q
uestions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/quest
ions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://stackoverflow.com/questions/70718
209/workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow
.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://st
ackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-mat
plotlib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin
.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-72578223482
11367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-da
ta-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quali
ty-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xav
ier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/ni
cole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-
bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-da
ta-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttp
s://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabaru
n-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.li
nkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/
advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sro
yc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\
nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-fe
ed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-b
ig-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-te
sting\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayakin
g\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://w
ww.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-719751
3747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-
quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-72
55911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-test
ing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-in
sights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-bes
t-ways-test-big-data-analytics-project-results-tnd0f\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-appt
estify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-scie
nce-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big
-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/p
ulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-
data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-funct
ion-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stacko
verflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://st
ackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/que
stions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-fr
om-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how
-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how
-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-dat
a\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-
inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77
991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client\nhttps://stackoverflow.com/questions/248411
42/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-processing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se\nhttps://
stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverflow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr\nhttps://stackoverflow.com
/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing\nhttps://stackoverflow.com/questions/48373636/big-data-in-datalab\nhttps://stackoverf
low.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-load-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark\nhttps://stackoverflow.com/questions/58130854/laravel-pass-big-data-
through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client\nhttps://stackov
erflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-processing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-proce
ssing-big-data-se\nhttps://stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverflow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr
\nhttps://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing\nhttps://stackoverflow.com/questions/48373636/big-data-in-
datalab\nhttps://stackoverflow.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-load-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark\nhttps://stackoverflow.com/questions/5813
0854/laravel-pass-big-data-through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-
net-client\nhttps://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-processing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetc
hfailedexception-when-processing-big-data-se\nhttps://stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverflow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inpu
ts-and-multiple-outputs-thr\nhttps://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing\nhttps://stackoverflow.com/ques
tions/48373636/big-data-in-datalab\nhttps://stackoverflow.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-load-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark\nhttps://stack
overflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/questions/24262041/how-to-se
nd-big-data-via-signalr-in-net-client\nhttps://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-processing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/questions/34941410/fetchfail
edexception-or-metadatafetchfailedexception-when-processing-big-data-se\nhttps://stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverflow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466/how-to-analyze-the-relatio
nship-between-multiple-inputs-and-multiple-outputs-thr\nhttps://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing\nhtt
ps://stackoverflow.com/questions/48373636/big-data-in-datalab\nhttps://stackoverflow.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-load-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-
hadoop-spark\nhttps://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/q
uestions/24262041/how-to-send-big-data-via-signalr-in-net-client\nhttps://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-processing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/q
uestions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se\nhttps://stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverflow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466
/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr\nhttps://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-fo
r-big-data-etl-testing\nhttps://stackoverflow.com/questions/48373636/big-data-in-datalab\nhttps://stackoverflow.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-load-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-
results-when-talking-about-hadoop-spark\nhttps://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexch
ange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwareengineering.stackexchange.com/questions/340687/readi
ng-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing",
+ "metadata": {
+ "filename": "all_posts_mined.csv",
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\mrs_oliveira2025\\all_posts_mined.csv",
+ "size": 395611,
+ "source": "docs_to_import"
+ },
+ "id": "cc69ebd7-3f8a-4062-a785-e2a5f9dae6c7"
+ },
+ "0f5718f6-5185-4066-9015-9979707fad52": {
+ "content": "Link\nhttps://dev.to/dataform/testing-data-quality-with-sql-assertions-248g\nhttps://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n\nhttps://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm\nhttps://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4\nhttps://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp\nhttps://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22\nhttps://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63\nhttps://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk\nhttps://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730\nhttps://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j\nhttps://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo\nhttps://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb\nhttps://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd\nhttps://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl\nhttps://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a\nhttps://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp\nhttps://dev.to/andyb19
79/scichart-is-the-fastest-js-chart-library-available-3o3c\nhttps://dev.to/m1pko/data-quality-technical-debt-from-hell\nhttps://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i\nhttps://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8\nhttps://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47\nhttps://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf\nhttps://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag\nhttps://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic\nhttps://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh\nhttps://dev.to/namnguyen\nhttps://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5\nhttps://dev.to/codexam/why-is-big-data-important-40ha\nhttps://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533\nhttps://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j\nhttps://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo\nhttps://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob\nhttps://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52\nhttps://dev.to/jeremystan/airbnb-quality-data-for-all-280f\nhttps://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43\nhttps://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top\nhttps://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908\nhttps://dev.to/doriansabitov/ho
w-to-simplify-large-salesforce-data-migration-52km\nhttps://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e\nhttps://dev.to/daryashirokova\nhttps://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4\nhttps://dev.to/reneebetina\nhttps://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363\nhttps://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a\nhttps://dev.to/apssouza22/tech-lead-playbook-523\nhttps://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56\nhttps://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm\nhttps://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest\nhttps://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm\nhttps://dev.to/dataform\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii\nhttps://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce\nhttps://dev.to/berthaw82414312\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi\nhttps://dev.to/tinybirdco\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm\nhttps://dev.to/madgan95/introduction-to-big-data-analysis-4cg1\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your
-sql-queries-2hd7\nhttps://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i\nhttps://dev.to/andyb1979/android-chart-performance-comparison-5ej7\nhttps://dev.to/habereder/comment/po6j\nhttps://dev.to/bytebodger/litmus-tests-in-tech-1ll7\nhttps://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp\nhttps://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf\nhttps://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest\nhttps://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e\nhttps://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62\nhttps://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi\nhttps://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo\nhttps://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1\nhttps://dev.to/t/testing/page/73\nhttps://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd\nhttps://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm\nhttps://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49\nhttps://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5
a0p\nhttps://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk\nhttps://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage\nhttps://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection\nhttps://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data\nhttps://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo\nhttps://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data\nhttps://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process\nhttps://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data\nhttps://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python\nhttps://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data\nhttps://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut\nhttps://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow\nhttps://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r\nhttps://stackoverflow.com/questions/65289092/python-mysql-insert-big-data\nhttps://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other\nhttps://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter\nhttps://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w\nhttps://stackoverflow.com/questions/64961961/shared-array-for-big-data\nhttps://
stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu\nhttps://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i\nhttps://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list\nhttps://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels\nhttps://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming\nhttps://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk\nhttps://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year\nhttps://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution\nhttps://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget\nhttps://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data\nhttps://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data\nhttps://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes\nhttps://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets\nhttps://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server\nhttps://stackoverflow.com/questions/64014590/application-insights-with-big-data\nhttps://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but\nhttps://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high\nhttps://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data\nhttps://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop\nhttps://st
ackoverflow.com/questions/61221081/random-forest-for-big-data\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler\nhttps://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base\nhttps://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data\nhttps://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data\nhttps://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations\nhttps://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core\nhttps://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data\nhttps://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view\nhttps://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data\nhttps://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame\nhttps://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0\nhttps://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded\nhttps://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse\nhttps://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data\nhttps://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster\nhttps://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data\nhttps://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data\nhttps://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql\nhttps://stackoverflow.com/questions/62393655/py
thon-creating-big-data-base-with-arrays-and-dictionary\nhttps://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design\nhttps://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas\nhttps://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization\nhttps://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation\nhttps://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data\nhttps://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file\nhttps://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling\nhttps://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python\nhttps://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python\nhttps://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c\nhttps://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index\nhttps://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql\nhttps://stackoverflow.com/questions/61506168/return-big-data-using-pymongo\nhttps://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data\nhttps://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group\nhttps://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse\nhttps://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data\nhttps://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql\nhttps://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel\nhttps://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r\nht
tps://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python\nhttps://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data\nhttps://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data\nhttps://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data\nhttps://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient\nhttps://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python\nhttps://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny\nhttps://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data\nhttps://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data\nhttps://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into\nhttps://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d\nhttps://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists\nhttps://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set\nhttps://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python\nhttps://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data\nhttps://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga\nhttps://stackoverflow.com/questions/60384558/big-data-conditional-agregration\nhttps://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb\nhttps://stackoverflow.com/questions/60306007/python-big-data-regression\nhttps://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-
live-dashboard-for-big-data-using-net\nhttps://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview\nhttps://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets\nhttps://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic\nhttps://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data\nhttps://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file\nhttps://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python\nhttps://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk\nhttps://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data\nhttps://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services\nhttps://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects\nhttps://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster\nhttps://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining\nhttps://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v\nhttps://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in\nhttps://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data\nhttps://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc\nhttps://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native\nhttps://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat\nhttps://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-set
s-of-distances-in-r\nhttps://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r\nhttps://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds\nhttps://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form\nhttps://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c\nhttps://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data\nhttps://stackoverflow.com/questions/69758458/big-data-structure\nhttps://stackoverflow.com/questions/69787453/big-data-analytics-using-spark\nhttps://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for\nhttps://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data\nhttps://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native\nhttps://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user\nhttps://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps\nhttps://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in\nhttps://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel\nhttps://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time\nhttps://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl\nhttps://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl\nhttps://stackoverflow.com/questions/69284626/big-data-manipulations-with-python\nhttps://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds\nhttps://stackoverflow.com/questions/6
8983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data\nhttps://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown\nhttps://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data\nhttps://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages\nhttps://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data\nhttps://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram\nhttps://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra\nhttps://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data\nhttps://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing\nhttps://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data\nhttps://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post\nhttps://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file\nhttps://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql\nhttps://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql\nhttps://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage\nhttps://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data\nhttps://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api\nhttps://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values\nhttps://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data\nhttps://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system\nhttps://stackov
erflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark\nhttps://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data\nhttps://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t\nhttps://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r\nhttps://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql\nhttps://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data\nhttps://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak\nhttps://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed\nhttps://stackoverflow.com/questions/66744410/laravel-delete-big-data\nhttps://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c\nhttps://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql\nhttps://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql\nhttps://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s\nhttps://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice\nhttps://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data\nhttps://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members\nhttps://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api\nhttps://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle\nhttps://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark\nhttps://stackoverflow.com/questions/66078412/flutter-tcp-so
cket-seems-to-loose-1-2-bytes-when-sending-big-data\nhttps://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks\nhttps://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set\nhttps://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta\nhttps://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files\nhttps://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category\nhttps://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose\nhttps://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out\nhttps://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas\nhttps://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce\nhttps://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection\nhttps://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas\nhttps://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript\nhttps://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db\nhttps://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09\nhttps://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485\nhttps://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e\nhttps://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf\nhttps://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3\nhttps://kovidrathee.mediu
m.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON\nhttps://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948\nhttps://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259\nhttps://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb\nhttps://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201\nhttps://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e\nhttps://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2\nhttps://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63\nhttps://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e\nhttps://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9\nhttps://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9\nhttps://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d\nhttps://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7\nhttps://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab\nhttps://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3\nhttps://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390\nhttps://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b\nhttps://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b\nhttps://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce\nhttps://m
edium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c\nhttps://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364\nhttps://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053\nhttps://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5\nhttps://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259\nhttps://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8\nhttps://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f\nhttps://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0\nhttps://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7\nhttps://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570\nhttps://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b\nhttps://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b\nhttps://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0\nhttps://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84\nhttps://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5\nhttps://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d\nhttps://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e\nhttps://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4\nhttps://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f\nhttps://medium.com/hurb-labs
/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510\nhttps://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d\nhttps://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa\nhttps://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6\nhttps://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b\nhttps://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff\nhttps://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e\nhttps://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b\nhttps://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6\nhttps://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e\nhttps://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17\nhttps://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b\nhttps://medium.com/@Dima/big-data-checklist-1b8e3214f96\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22\nhttps://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e\nhttps://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165\nhttps://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee\nhttps://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apac
he-spark-ac87cbbf3c37\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69\nhttps://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615\nhttps://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c\nhttps://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2\nhttps://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246\nhttps://medium.com/@hans.knechtions/test-in-production-85224e7a82f3\nhttps://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494\nhttps://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127\nhttps://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9\nhttps://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a\nhttps://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867\nhttps://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf\nhttps://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7\nhttps://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83\nhttps://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187\nhttps://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1\nhttps://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08\nhttps://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946\nhttps://medium.com/snowflake/avoid
-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973\nhttps://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3\nhttps://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143\nhttps://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082\nhttps://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76\nhttps://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618\nhttps://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67\nhttps://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93\nhttps://informationit27.medium.com/explain-big-data-testing-b555517f9902\nhttps://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be\nhttps://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql\nhttps://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck\nhttps://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet\nhttps://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output\nhttps://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory\nhttps://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files\nhttps://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python
-on-big-data\nhttps://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash\nhttps://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id\nhttps://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data\nhttps://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data\nhttps://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec\nhttps://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js\nhttps://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows\nhttps://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python\nhttps://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb\nhttps://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data\nhttps://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t\nhttps://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating\nhttps://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data\nhttps://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss\nhttps://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api\nhttps://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data\nhttps://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt\nhttps://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set\nhttps://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data\nhttps://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-
data-spark-dataframe-if-the-id-matches\nhttps://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark\nhttps://stackoverflow.com/questions/76104308/randomforest-for-big-data\nhttps://stackoverflow.com/questions/76103457/variable-selection-in-big-data\nhttps://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox\nhttps://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases\nhttps://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server\nhttps://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set\nhttps://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable\nhttps://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame\nhttps://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls\nhttps://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template\nhttps://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter\nhttps://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data\nhttps://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r\nhttps://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb\nhttps://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files\nhttps://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data\nhttps://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch\nhttps://stackoverflow.com/questions/70718209/
workaround-for-ggplot2facet-grid-big-data-bug\nhttps://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin\nhttps://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python\nhttps://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once\nhttps://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts\nhttps://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an\nhttps://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue\nhttps://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors\nhttps://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data\nhttps://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python\nhttps://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark\nhttps://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file\nhttps://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data\nhttps://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data\nhttps://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob\nhttps://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w\nhttps://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data\nhttps://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit\nhttps://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time\nhttps://stackoverflow.com
/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage\nhttps://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common\nhttps://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data\nhttps://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle\nhttps://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance\nhttps://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python\nhttps://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data\nhttps://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an\nhttps://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data\nhttps://stackoverflow.com/questions/73274450/big-data-in-tableview\nhttps://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference\nhttps://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin\nhttps://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql\nhttps://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data\nhttps://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c\nhttps://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set\nhttps://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery\nhttps://stackoverflow.com/questions/72914084/historical-big-data-slow-queries\nhttps://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way\nhttps://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file\nhttps://stacko
verflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix\nhttps://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java\nhttps://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python\nhttps://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys\nhttps://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu\nhttps://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data\nhttps://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy\nhttps://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame\nhttps://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames\nhttps://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements\nhttps://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data\nhttps://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise\nhttps://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values\nhttps://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines\nhttps://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g\nhttps://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns\nhttps://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data\nhttps://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object\nhttps://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplot
lib-for-big-data\nhttps://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data\nhttps://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python\nhttps://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error\nhttps://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar\nhttps://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor\nhttps://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu\nhttps://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data\nhttps://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3\nhttps://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native\nhttps://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of\nhttps://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment\nhttps://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca\nhttps://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data\nhttps://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks\nhttps://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves\nhttps://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data\nhttps://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps\nhttps://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality\nhttps://www.linkedin.com
/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB\nhttps://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl\nhttps://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality\nhttps://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK\nhttps://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan\nhttps://www.linkedin.com/pulse/big-data-testing-qa-touch\nhttps://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir\nhttps://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7\nhttps://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory\nhttps://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen\nhttps://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle\nhttps://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran\nhttps://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/\nhttps://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow\nhttps://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf\nhttps://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e\nhttps://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-725782234821136
7937-rkRc\nhttps://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay\nhttps://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov\nhttps://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc\nhttps://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB\nhttps://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-\nhttps://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post\nhttps://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing\nhttps://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering\nhttps://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair\nhttps://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy\nhttps://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson\nhttps://www.linkedin.com/pulse/testing-big-data-gagan-mehra\nhttps://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment\nhttps://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment\nhttps://www.linkedin.com/pulse/big-data-w
arehouse-testing-nigel-shaw\nhttps://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f\nhttps://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport\nhttps://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami\nhttps://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin\nhttps://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR\nhttps://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management\nhttps://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking\nhttps://www.linkedin.com/pulse/data-quality-testing-grant-brodie\nhttps://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308\nhttps://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla\nhttps://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan\nhttps://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta\nhttps://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter\nhttps://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov\nhttps://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa\nhttps://www.linkedin.com/advice/1/what-steps-effective-data-quality-t
esting-skills-data-warehousing-ka8kc\nhttps://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality\nhttps://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca\nhttps://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri\nhttps://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye\nhttps://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner\nhttps://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5\nhttps://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf\nhttps://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card\nhttps://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki\nhttps://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics\nhttps://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az\nhttps://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc\nhttps://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci\nhttps://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria\nhttps://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier\
nhttps://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc\nhttps://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin\nhttps://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik\nhttps://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha\nhttps://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot\nhttps://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325\nhttps://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953\nhttps://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj\nhttps://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view\nhttps://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality\nhttps://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f\nhttps://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r\nhttps://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data\nhttps://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports\nhttps://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data\nhttps://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis\nhttps://stac
koverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data\nhttps://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino\nhttps://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c\nhttps://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data\nhttps://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust\nhttps://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index\nhttps://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data\nhttps://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r\nhttps://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt\nhttps://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a\nhttps://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back\nhttps://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am\nhttps://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b\nhttps://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table\nhttps://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data\nhttps://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func\nhttps://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter\nhttps://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data\nhttps://stackoverflow.com/questions/78319772/why-do-shared-
memory-segments-run-longer-than-pipe-when-transferring-big-data\nhttps://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse\nhttps://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string\nhttps://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages\nhttps://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment\nhttps://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data\nhttps://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way\nhttps://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov\nhttps://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data\nhttps://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data\nhttps://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame\nhttps://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications\nhttps://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column\nhttps://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data\nhttps://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda\nhttps://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools\nhttps://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv\nhttps://stackoverflow.com/questions/28236897/replace-outliers-from-big-data\nhttps://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data\nhttps://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-w
hich-category-hado\nhttps://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data\nhttps://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored\nhttps://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl\nhttps://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models\nhttps://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel\nhttps://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client\nhttps://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand\nhttps://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same\nhttps://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data\nhttps://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift\nhttps://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data\nhttps://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram\nhttps://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data\nhttps://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case\nhttps://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods\nhttps://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f\nhttps://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh\nhttps://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set\nhttps://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications\nhttps://stackoverflow.com/questions/48997676/error-message-for-proce
ssing-big-data\nhttps://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text\nhttps://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data\nhttps://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data\nhttps://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz\nhttps://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize\nhttps://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se\nhttps://stackoverflow.com/questions/31428581/incremental-pca-on-big-data\nhttps://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file\nhttps://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set\nhttps://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame\nhttps://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace\nhttps://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data\nhttps://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel\nhttps://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data\nhttps://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing\nhttps://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi\nhttps://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and\nhttps://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage\nhttps://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise\nhttps://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data\nhttps://stackoverfl
ow.com/questions/44502825/performance-testing-on-big-data\nhttps://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive\nhttps://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as\nhttps://stackoverflow.com/questions/31162894/how-to-create-big-data-project\nhttps://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different\nhttps://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr\nhttps://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications\nhttps://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c\nhttps://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file\nhttps://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri\nhttps://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern\nhttps://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing\nhttps://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system\nhttps://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products\nhttps://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data\nhttps://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data\nhttps://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data\nhttps://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data\nhttps://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms\nhttps://stackoverflow.com/questions/59456956/caching-big-data-in-net-cor
e-web-api\nhttps://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job\nhttps://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil\nhttps://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift\nhttps://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented\nhttps://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing\nhttps://stackoverflow.com/questions/48373636/big-data-in-datalab\nhttps://stackoverflow.com/questions/58725538/do-we-visualize-big-data\nhttps://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don\nhttps://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python\nhttps://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand\nhttps://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error\nhttps://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository\nhttps://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas\nhttps://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database\nhttps://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data\nhttps://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php\nhttps://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial\nhttps://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files\nhttps://stackoverflow.com/questions/58308006/big-data-lo
ad-in-salesforce\nhttps://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b\nhttps://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key\nhttps://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data\nhttps://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices\nhttps://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark\nhttps://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow\nhttps://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest\nhttps://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db\nhttps://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e\nhttps://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data\nhttps://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data\nhttps://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana\nhttps://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data\nhttps://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data\nhttps://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction\nhttps://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data\nhttps://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi\nhttps://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app\nhttps://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second\nhttps://softwaree
ngineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db\nhttps://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting\nhttps://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data\nhttps://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments\nhttps://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures\nhttps://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed\nhttps://sqa.stackexchange.com/questions/37718/big-data-application-testing",
+ "metadata": {
+ "filename": "cleaned_all_posts_mined.csv",
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\mrs_oliveira2025\\cleaned_all_posts_mined.csv",
+ "size": 73869,
+ "source": "docs_to_import"
+ },
+ "id": "0f5718f6-5185-4066-9015-9979707fad52"
+ },
+ "ea7f56ff-e3f5-4c67-97d6-51f906d3e001": {
+ "content": "link | ferramentas | metodo\nhttps://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp | JUnit, JUnit 5, JUnit, Jest | Integration Testing\nhttps://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 | nan | Exploratory Testing\nhttps://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 | Selenium | nan\nhttps://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo | nan | Test-Driven Development\nhttps://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi | Selenium | nan\nhttps://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl | nan | Regression Testing\nhttps://dev.to/sudo_pradip/dbt-and-software-engineering-4006 | nan | Regression Testing, Unit Testing, Acceptance Testing\nhttps://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a | Jest | Behavior-Driven Development, Integration Testing, Load Testing\nhttps://dev.to/m1pko/data-quality-technical-debt-from-hell | nan | Regression Testing\nhttps://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 | Cucumber | Test-Driven Development\nhttps://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf | Selenium, Appium | Regression Testing\nhttps://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i | Mockito, Jest | Unit Testing, Integration Testing\nhttps://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa | Selenium | nan\nhttps://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 | JUnit, JUnit | nan\nhttps://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja | nan | Regression Testing\nhttps://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin | Selenium, Cucumber, Appium | Regression Testing, Unit Testing, 
Integration Testing\nhttps://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c | nan | Smoke Testing\nhttps://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii | nan | Unit Testing, Integration Testing\nhttps://dev.to/berthaw82414312 | Selenium, Appium | Test-Driven Development, Exploratory Testing, Regression Testing, Unit Testing, Integration Testing\nhttps://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi | nan | Regression Testing, Load Testing\nhttps://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm | nan | Regression Testing, Acceptance Testing, Load Testing\nhttps://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 | nan | Regression Testing, Unit Testing\nhttps://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i | Selenium | nan\nhttps://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf | nan | Unit Testing, Integration Testing\nhttps://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p | Selenium, Appium | nan\nhttps://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j | JUnit, JUnit | Test-Driven Development, Unit Testing\nhttps://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e | Selenium, TestNG, Appium, Jest | Exploratory Testing, Regression Testing\nhttps://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db | Selenium | nan\nhttps://dev.to/meghasharmaaaa/devops-toolchain-mlo | JUnit, Selenium, TestNG, JUnit | nan\nhttps://dev.to/t/testing/page/73 | Selenium, Postman, Jest | Regression Testing, Integration Testing\nhttps://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm | Selenium | nan\nhttps://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter | nan | Load 
Testing\nhttps://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler | nan | Load Testing\nhttps://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data | nan | Load Testing\nhttps://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db | nan | Unit Testing\nhttps://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON | Cucumber | Unit Testing\nhttps://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 | nan | Load Testing\nhttps://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 | nan | Unit Testing\nhttps://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c | nan | Unit Testing\nhttps://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff | nan | Unit Testing\nhttps://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b | nan | Regression Testing\nhttps://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 | nan | Unit Testing, Integration Testing, Acceptance Testing\nhttps://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e | nan | Regression Testing\nhttps://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 | nan | Integration Testing\nhttps://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 | JUnit, JUnit | nan\nhttps://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c | nan | Unit Testing, Integration 
Testing\nhttps://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 | nan | Regression Testing, Integration Testing\nhttps://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 | JUnit, JUnit | Unit Testing\nhttps://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 | nan | Smoke Testing\nhttps://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality | Selenium | nan\nhttps://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory | Selenium | nan\nhttps://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects | JUnit, Selenium, TestNG, Cucumber, JUnit | Test-Driven Development, Behavior-Driven Development, Regression Testing, Unit Testing, Integration Testing, Acceptance Testing, Smoke Testing, Load Testing\nhttps://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle | nan | Regression Testing, Integration Testing, Load Testing\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e | nan | Acceptance Testing, Load Testing\nhttps://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your | nan | Regression Testing, Unit Testing, Integration Testing\nhttps://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov | Selenium | Test-Driven Development\nhttps://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing | nan | Test-Driven Development, Unit Testing, Integration Testing\nhttps://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- | nan | Test-Driven Development, Exploratory Testing, Unit Testing\nhttps://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair | Selenium | nan\nhttps://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy | nan | Unit 
Testing, Integration Testing\nhttps://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment | nan | Unit Testing, Integration Testing, Acceptance Testing\nhttps://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f | Selenium, Cucumber, Appium | nan\nhttps://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e | nan | Regression Testing\nhttps://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory | nan | Acceptance Testing\nhttps://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z | nan | Smoke Testing\nhttps://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla | nan | Unit Testing\nhttps://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri | Selenium, TestNG | nan\nhttps://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye | Selenium | nan\nhttps://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki | Selenium, Appium | nan\nhttps://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view | nan | Exploratory Testing",
+ "metadata": {
+ "filename": "cleaned_posts_with_test_tools_and_methods (1).csv",
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\mrs_oliveira2025\\cleaned_posts_with_test_tools_and_methods (1).csv",
+ "size": 8647,
+ "source": "docs_to_import"
+ },
+ "id": "ea7f56ff-e3f5-4c67-97d6-51f906d3e001"
},
- "1f4646b3-0d61-4f89-94b5-2c5e534ce81c": {
- "id": "1f4646b3-0d61-4f89-94b5-2c5e534ce81c",
+ "8fa98f66-7342-4f10-ba38-925a481d5132": {
"content": "# Guia de Testes de Performance\n\n## Introdução\n\nTestes de performance são essenciais para garantir que aplicações de dados funcionem adequadamente sob carga. Este documento aborda estratégias e técnicas para testar sistemas de big data.\n\n## Tipos de Testes de Performance\n\n### 1. Testes de Carga\n- Verificar comportamento sob carga normal\n- Identificar limites de capacidade\n- Monitorar tempo de resposta e throughput\n\n### 2. Testes de Stress\n- Testar além dos limites normais\n- Identificar ponto de quebra do sistema\n- Verificar recuperação após sobrecarga\n\n### 3. Testes de Volume\n- Grandes volumes de dados\n- Avaliar escalabilidade\n- Testar limites de armazenamento\n\n## PySpark para Performance\n\n```python\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\n\n# Configuração otimizada\nspark = SparkSession.builder \\\n .appName(\"PerformanceTest\") \\\n .config(\"spark.sql.adaptive.enabled\", \"true\") \\\n .config(\"spark.sql.adaptive.coalescePartitions.enabled\", \"true\") \\\n .config(\"spark.sql.adaptive.skewJoin.enabled\", \"true\") \\\n .getOrCreate()\n\n# Monitoramento de performance\ndef monitor_query_performance(df, query_name):\n start_time = time.time()\n result = df.count() # ou qualquer operação\n end_time = time.time()\n \n print(f\"Query: {query_name}\")\n print(f\"Tempo: {end_time - start_time:.2f}s\")\n print(f\"Registros: {result}\")\n \n return result\n```\n\n## Métricas Importantes\n\n- **Latência**: Tempo de resposta individual\n- **Throughput**: Operações por segundo\n- **Utilização de CPU**: Percentual de uso\n- **Memória**: Consumo e garbage collection\n- **I/O**: Leitura/escrita de dados\n\n## Ferramentas de Monitoramento\n\n- Spark UI para análise de jobs\n- Ganglia para métricas de cluster\n- Grafana para dashboards\n- JProfiler para análise de JVM",
"metadata": {
"filename": "performance_testing.md",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\performance_testing.md",
- "file_size": 1889,
- "file_type": ".md",
- "imported_at": "2025-12-17T21:23:23.700967",
- "content_length": 1801,
- "type": "markdown",
- "title": "Guia de Testes de Performance"
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\performance_testing.md",
+ "size": 1889,
+ "source": "docs_to_import"
+ },
+ "id": "8fa98f66-7342-4f10-ba38-925a481d5132"
+ },
+ "0c4d5da7-7180-4aca-9b24-cb17459173ac": {
+ "content": "# docs_to_import/\n\nPlace documents (PDF, TXT) here to import into the RAG knowledge base.\n\nRun the import script from the project root:\n\n```bash\npython utilities/import_documents.py docs_to_import\n```\n\n⚠️ Files in this directory are **not versioned** (see `.gitignore`).\nUse the import script after cloning the repository to populate the knowledge base.\n\n## Supported Formats\n- PDF (`.pdf`)\n- Plain text (`.txt`)\n- Markdown (`.md`)\n",
+ "metadata": {
+ "filename": "README.md",
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\README.md",
+ "size": 453,
+ "source": "docs_to_import"
+ },
+ "id": "0c4d5da7-7180-4aca-9b24-cb17459173ac"
},
- "877f4762-0d7c-4c31-ab5f-f50d0487235e": {
- "id": "877f4762-0d7c-4c31-ab5f-f50d0487235e",
- "content": "[Página 1]\nAdvancing beyond technicism\nwhen managing big data in\ncompanies ’decision-making\nFrancesco Caputo, Barbara Keller, Michael Möhring, Luca Carrubbo andRainer Schmidt\nAbstract\nPurpose –In recognising the key role of business intelligence and big data analytics in influencing\ncompanies’ decision-making processes, this paper aims to codify the main phases through which\ncompanies can approach, develop and manage big data analytics.\nDesign/methodology/approach –By adopting a research strategy based on case studies, this paper\ndepicts the main phases and challenges that companies ‘‘live’’ through in approaching big data analytics\nas a way to support their decision-making processes. The analysis of case studies has been chosen as\nthe main research method because it offers the possibility for different data sources to describe aphenomenon and subsequently to develop and test theories.\nFindings –This paper provides a possible depiction of the main phases and challenges through which\nthe approach(es) to big data analytics can emerge and evolve over time with reference to companies’decision-making processes.\nResearch limitations/implications –This paper recalls the attention of researchers in defining clear\npatterns through which technology-based approaches should be developed. In its depiction of the main\nphases of the development of big data analytics in companies’ decision-making processes, this paper\nhighlights the possible domains in which to define and renovate approaches to value. The proposed\nconceptual model derives from the adoption of an inductive approach. Despite its validity, it is discussedand questioned through multiple case studies. 
In addition, its generalisability requires further discussion\nand analysis in the light of alternative interpretative perspectives.\nPractical implications –The reflections herein offer practitioners interested in company management\nthe possibility to develop performance measurement tools that can evaluate how each phase can\ncontribute to companies’ value creation processes.\nOriginality/value –This paper contributes to the ongoing debate about the role of digital technologies in\ninfluencing managerial and social models. This paper provides a conceptual model that is able to\nsupport both researchers and practitioners in understanding through which phases big data analytics\ncan be approached and managed to enhance value processes.\nKeywords Big data, Big data analytics, Companies’ decision-making, Smarter management\nPaper type Technical paper\n1. Preliminary reflections\nIn the past few decades, socio-economic configurations have profoundly changed because\nof the increasing use and accessibility of Information and Communication Technologies\n(ICT) in multiple domains of everyday life ( Forester, 1987 ;Turban et al. ,1 9 9 8 ;Drucker, 2011 ;\nCaputo et al. , 2019b ). Consolidated views based on the representation of technologies for\ndata management as “simple instruments” for supporting decision-making activities have\nprogressively shown that they are incapable of explaining ongoing dynamics and trends\n(Caputo et al. , 2019c ). 
Similarly, new interpretative approaches and managerial models are\nstrongly required by researchers and practitioners interested in effectively understandingFrancesco Caputo is based at\nthe Department of Economics,Management and Institutions,University of Naples Federico\nII, Naples, Italy.\nBarbara Keller is based at theDuale Hochschule Baden-Wu¨rttemberg Stuttgart,\nStuttgart, Germany.Michael Mo ¨hring is based at\nthe Department of Informatics –\nHHZ Reutlingen University,Reutlingen, Germany.Luca Carrubbo is based at theDepartment of Managementand Innovation Systems,University of Salerno, Salerno,Italy.\nRainer Schmidt is based at the\nDepartment of ComputerScience and Mathematics,University of Applied SciencesMunich, Munich, Germany.\nReceived 8 October 2022\nRevised 26 January 2023Accepted 25 February 2023\nCorrigendum : It has come to\nthe attention of the publisher\nthat the article: Caputo, F.,\nKeller, B., Mo ¨hring, M.,\nCarrubbo, L. and Schmidt, R.(2023), “Advancing beyondtechnicism when managing bigdata in companies’ decision-making”, Journal of Knowledge\nManagement , Vol. ahead-of-\nprint No. ahead-of-print.\nhttps://\ndoi.org/10.1108/JKM-10-2022-\n0794 did not accurately display\nMo¨hring, M.‘s affiliation.\nOur guidelines state that\naffiliations should be supplied\nin full when the article issubmitted.\nThe city corresponding to\nReutlingen University has been\namended from Munich toReutlingen.\nDOI10.1108/JKM-10-2022-0794 VOL. 27 NO. 10 2023, pp. 
2797-2809, ©Emerald Publishing Limited, ISSN 1367-3270 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2797\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 2]\nwhat the main implications, consequences and effects of the increasing use of ICT in\nbusiness and social dynamics are ( Castells, 1999 ;Markus and Topi, 2015 ).\n[2015] Building upon this widely recognised need, in recent decades, a challenging debate\nhas emerged around the topic of big data analytics as “a way of extracting value from thesehuge volumes of information, and it drives new market opportunities and maximizes\ncustomer retention” (\nZakir et al. ,2 0 1 5 , p. 81). Several contributions have been provided\nwith reference to the multiple advantages that it is possible to obtain for companies from a“new” approach in the collection, coding and management of data related to the multiple\ndimensions of shopping expeditions and evaluations (\nGriffin et al. ,2 0 0 0 ;Mummalaneni,\n2005 ;Demangeot and Broderick, 2006 ;Amendola et al. , 2018 ;Ardito et al. ,2 0 1 8 ). Multiple\nstimuli for reflections have also been provided with reference to the ways in which people,processes and technologies can be combined to improve the quality of companies’ and\nmarkets’ approaches in data collection and use (\nAlter, 2006 ;Singh and Del Giudice, 2019 ).\nAs effectively summarised by Demchenko et al. (2012 , p. 
614), “Data Science is becoming\na new technology driver and requires re-thinking a number of infrastructure, components, solutions and processes to address the following general challenges: Exponential growth of\ndata volume produced by different research instruments and/or collected from sensors;\nNeed to consolidate e-Infrastructure as [a] persistent research platform to ensure research continuity and oration, deliver/offer persistent services, with [an] adequate governance model.” According to the authors’ reflections, the challenging domain about big data should\nmainly refer to the infrastructure and processes required for ensuring the effective collection\nand organisation of a huge volume of data.\nDespite the relevance of the aforementioned dimensions, they represent only a “small” part of\nthe multiple reflections that the ongoing transition towards a knowledge\nera based on technology infrastructure seems to require. Several relevant elements related to human\napproaches to big data, the consequences of big data analytics in companies’ decision-making processes and the antecedents capable of addressing the ongoing digital transition (\nCaputo et al., 2019a; Chinnaswamy et al., 2018), among others, seem to be vastly\nunderestimated. Accordingly, the paper proposes extending current perspectives in the\nstudy of big data analytics by focusing attention on the intriguing domain of big data analytics, specifically “the extraction of hidden insight about consumer behaviour from big data and the exploitation of that insight through advantageous interpretation” (\nErevelles et al., 2016, p. 897). Thanks to the adoption of a research strategy based on case studies,\nthe paper aims to depict the main phases that companies face in the process of reshaping decision-making processes through big data analytics. 
The analysis of case studies has\nbeen chosen as the main research method because it offers the possibility for different data\nsources to describe a phenomenon and subsequently to develop and test theories.\nThe paper is structured as follows. In Section 2, the theoretical background will be\npresented by focusing attention on smart management and on the role of big data analytics\nin companies’ decision-making processes as relevant domains with reference to which\nproposed reflections have been developed. In Section 3, the method and data collection ofthe proposed research will be reported, whilst in Section 4, the results of the proposedresearch will be summarised to enrich the current debate about the role of big data\nanalytics in reshaping companies’ decision-making processes. Finally, in Section 5, the\nstudy’s preliminary conclusions, main limitations, implications and possible future directionswill be presented.\n2. Theoretical background\nThe way in which organisations apply data analysis has changed over time ( Chen et al. ,\n2012 ). In recent years, different methods have been developed that depend on the different\ndata sources and related data structures.\nPAGE 2798jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 3]\nIn general, different data sources with structured and/or unstructured data can be part of\nbig data projects ( Gandomi and Haider, 2015 ). In the past, enterprises were only able to\nanalyse structured datasets like customer order data coming from, for example, CRM orERP systems (\nChen et al. , 2012 ). The data used for analyses mainly consisted of numbers\nor categorial variables, for example. The way of collecting, storing and analysing data was\nless complex in comparison to more recent data sources containing unstructured data(\nBuneman et al. 
, 1997 ;Blumberg and Atre, 2003 ;Baker and Thien, 2020 ;Del Giudice et al. ,\n2021 ). Today, however, up to 90% of the collected data is unstructured data like texts,\nimages, audio and video ( Harbart, 2021 ). The analysis of unstructured data is currently\nchallenging organisations because of its unsuitability for use in conventional data models(\nHarbart, 2021 ). The use of unstructured data together with structured data is manifold. For\ninstance, it can be used to improve the quality and the possibilities of prediction within big\ndata analytics ( Davenport et al. , 2021 ). Nevertheless, the more data types are included in\nanalytical projects, the more different methods must be used. Today, more and more IoT-related data sources like connected home appliances (\nBayer et al. , 2020 ) or services like\nGoogle Popular times ( Mo¨hring et al. ,2 0 2 0 ) can be used to predict and better understand\ncustomer behaviour. These new data sources must be integrated within the analytical\nlandscape to be used in related analysis. Another interesting use case that highlights thechallenges of the benefits of big data analytics is product returns in e-commerce. This fieldis even more important because it meets both customer behaviour and the sustainability\nconcept, as well as helping to easily understand the facets appearing in big data analysis.\nFor instance, if an organisation wants to use online customer reviews (unstructured textualdata) to predict the product returns probability (\nSchmidt and Mo ¨hring, 2013 ;Mo¨hring et al. ,\n2013 ), past customer order data from the CRM and ERP system (structured data) as well as\nimages (unstructured image data) from offered goods should also be integrated into the\nanalysis to enhance the quality of the prediction. Therefore, they must apply differentmethods like text mining for textual data, image pattern recognition for images and\ntraditional data mining techniques like regression or correlation analysis. 
In turn, this means\nthat different results, various key figures and quality criteria must be aggregated andharmonised within one comprehensive result (\nKaur et al. , 2019 ).\nFurthermore, the data must be stored in different locations like relational databases for the\norder data and/or within NoSql databases ( Stonebraker, 2010 ) like document-based\ndatabases for textual data. In sum, all these requirements will increase the complexity of big\ndata analytics projects and generate challenges for organisations running an analytical\nproject. In line with the identified methodological complexity and storing issues, thecomputational complexity also increases. The more variables are included in analyticalapproaches, the more steps for information processing and result calculation are\nnecessary. Therefore, organisations that are considering applying big data analytics must\nexplore the option of scalable public cloud computing services at major sites like AmazonAWS, Microsoft Azure and Google Cloud to capture the limitations of traditional non-scalable systems (\nSchmidt and Mo ¨hring, 2013 ).\n2.1 Challenges and dynamics of smart management\nNowadays, the dynamics in decision-making in all contexts are increasingly guided and\nconditioned by the reception, filtering, processing and use of data ( Raisinghani, 2000 ). The\nevolution of new technologies favours the development of virtuous processes [thanks to bigdata analytics techniques, data mining, machine learning, artificial intelligence (AI), etc.]\nthat support decision-making processes (\nNutt, 2008 ;Yang et al. , 2019 ). The growing\nuncertainty in all application areas accentuates the importance of the way in whichdecisions are made, especially if they involve significant consequences for the community.Decision making is a multidisciplinary topic that lends itself to different levels of analysis\nVOL. 27 NO. 
10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2799\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 4]\nwhen we focus on the various elements (including technological ones) that condition or\nfacilitate it ( Papadakis et al., 1998).\nDecision-making processes are increasingly data-driven. Therefore, the decisions are more\n“informed” because the exchange of information is rapid (often in real-time); hence, it can be precise, punctual, efficient and valid. From the electronic medical record (now fully\noperational) to the development of information systems to new communication protocols, it\nis possible to record a continuous flow of data, information (and the knowledge contained in it) and the\ninputs to be filtered, processed, used and managed in a timely manner (\nSharma et al.,\n2014). The risk of “data-deluge” and the difficulty of having useful elements is very high,\nwhile the possibility of making quick, accurate, thoughtful decisions becomes more and\nmore necessary, indeed fundamental ( Sabherwal and King, 1995; Citroen, 2011). In this\nsense, the evolution of decision-support-systems (DSS) assumes increasing importance in many critical “moments”, both for descriptive-analytics (e.g. diagnostics, evaluations and\nmonitoring), as well as in the follow-up analytics in the operational phases and even for\nforecasting possible choices in the future and related reasons through predictive analytics and prescriptive analytics (\nBoonstra, 2003).\nIn general, information sharing with shared databases, data-storage, data extraction and data\nprocessing favours the design of a more functional, versatile, scalable, context-friendly service\nprovision, where smart management can make a difference and thus deserves to be further explored. For this, it becomes important to study the main characteristics of data that can be\nacquired. 
Here, the so-called “10V[s] of big-data” (Volume, Velocity, Variety, Veracity, Value,\nValidity, Variability, Venue, Vocabulary and Vagueness) are often taken into consideration tounderstand how new knowledge is generated and, consequently, how much decision-making\nprocesses are affected; particularly with reference to the possible advantages of meta dating,\ndata modelling, architecture and data integration (\nManogaran et al. ,2 0 2 2 ).\nSimilarly, the most frequently used methods are studied to improve decision-making from\ndata management’s point of view. Typical topics of interest here are cloud computing for\ninformation sharing, artificial intelligence for the data interpretation available and the\ngeneration of new ones like data mining and machine learning. The aim here is to betterunderstand how the information flow works, what criticalities it presents, how it feeds the\nactivation and management of known protocols, how it integrates the various data-sources\nand how it supports the management of queries (\nHicks et al. ,2 0 0 6 ).\nAll this effectively integrates decision-making techniques (cost benefits, grid analysis, paired\ncomparison, compensatory strategies, etc.), with particular reference to conditions of uncertainty\nbecause of, for example, systematic errors, cognitive biases, risk situations, external distortions,\ninformation asymmetries, misalignments, internal friction, misunderstandings, technical oradministrative misunderstandings, legal aspects, technological crashes or even weak signals\nescaping, somatic markers and negative contingencies.\nThese issues are so fundamental and interesting that in the period between 2021 and 2027,\nEuropean investments will be geared towards building a smarter Europe throughinnovation, digitalisation, economic transformation and support for small- and medium-\nsized enterprises. 
EIT Digital has launched the 2022 call to promote entrepreneurship and\neducation for the construction of a strong digital Europe and contribute to the developmentof digital technology, digital industry, digital cities, digital wellbeing and digital finance.\nSince 2014, the European Commission has spoken out in favour of a thriving data-driven\neconomy (\nEuropean Commission, 2014 ); in 2015, it discussed a strategy for the digital\nsingle market in Europe ( European Commission, 2015 ). In 2018, the International Data\nCorporation estimated an increase of 16 trillion gigabytes of data, with an annual growth\nrate of 236% in terms of data generation to date; they linked this to the fact that decisions\nbased on knowledge generated by big data can lead to increased productivity andcompetitiveness and GDP (equal to 1.9% by 2020) (\nReinsel et al. , 2018 ).\nPAGE 2800jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 5]\nToday, the evolving trend of Big Data Analyses is an integral part of a new digital market.\nAccording to the European Commission, it guarantees the development of innovative and\ncompetitive business models. However, while having to comply with the EU data protectionframework, big data can involve significant risks and challenges, especially in fundamentalrights like privacy and data protection. 
More recently, the European Parliament discussedthe role of the data-based economy in the strategy for the digital union against the backgroundof all stakeholders and their daily life situations, such as consumers (ease of use, efficiency\nand savings), businesses (industry 4.0) and public administration (e-government), housing\n(smart cities), science, medicine (Mhealth), disaster response capacity and the fight againstcrime, etc.\n2.2 Big data in companies’ decision-making processes\nScientists and researchers have long since faced the challenges of data management,focusing their attention on possible ways to collect data both directly and indirectly(\nSapsford and Jupp, 1996 ;Hajian and Domingo-Ferrer, 2012 ). Several experiments have\nbeen conducted aiming to define the processes and protocols that enhance the\neffectiveness of data collection as a relevant way to extend consolidated knowledge aboutthe reasons, antecedents and motivations behind actors’ behaviours and decisions inmultiple domains (\nGrant and Mayer, 2009 ;Guiot and Roux, 2010 ;Daunt and Harris, 2012 ;\nRahrovani and Pinsonneault, 2020 ). Along this line, studies focusing on companies’\ndecision-making have also been developed and multiple approaches for collecting andanalysing data have been investigated (\nGoulding, 1999 ;Rokka and Uusitalo, 2008 ;Pac¸o\nand Lavrador, 2017 ).\nNowadays, all these approaches and contributions seem to be outmoded against the\nbackground of the disruptive role of big data analytics in the data and knowledge managementprocesses (\nPauleen and Wang, 2017 ). Today, big data infrastructure supports the handling of\ndata operations by facilitating the source’s integration and collaboration in real time with highstandards for control and data safety (\nSagiroglu and Sinanc, 2013 ).\nDemchenko et al. (2014 , p. 
105) reports “the Big Data definition as having the following 5V\nproperties: Volume, Velocity, Variety that constitute native/original Big Data properties, and Value and Veracity as acquired as a result of data[’s] initial classification and processing in\nthe context of a specific process or model.” These properties effectively summarise the\nrelevant contributions that big data can provide the management of a high volume of data in real time without “damaging” the granularity of information to ensure a realistic representation of the phenomenon (\nPolyakova et al., 2019).\nAccording to Erevelles et al. (2016), the properties of big data seem to provide a valuable\nsolution for organisations striving to find an answer to environmental and social changes through predictive approaches about market trends. More comprehensively, big data offers organisations the opportunities to increase:\n■ their dynamic capabilities – their “ability to respond to change incorporates skills and\nknowledge embedded within the organization to alter existing resources and create new value” ( Erevelles et al., 2016, pp. 898–899); and\n■ adaptive capabilities – as capabilities that do not derive “from a specific change in\norganizational structure but from the overall ability to capture consumer activities and extract hidden insights” ( Erevelles et al., 2016, p. 899).\nRecognising the disruptive role of big data in reinventing firms’ market approaches, it is\npossible to underline its contribution in supporting enterprises in innovating their relationships with the market by focusing on the “implementation of creative ideas” (\nGumusluoglu and Ilsev, 2009, p. 61). From this perspective, big data analytics can be seen\nas a valuable approach that supports firms to enforce their relationship by focusing on the\nVOL. 27 NO. 
10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2801\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 6]\ndefinition of the innovation management path based on their “ability to effectively acquire\nand exploit new information” ( Chaston et al. ,2 0 0 1 , p. 147). Data acquisition and exploitation\nbecame the bridge with the capacity to link innovation management, information management\nand market analysis under the common umbrella of big data analytics; this offers the\nopportunity to understand current interest in developing an effective model for information\nmanagement, allowing firms to better understand (and predict) market trends and\nexpectations based on big data analytics ( Erevelles et al. ,2 0 1 6 ).\nIn a nutshell, big data can be considered a disruptive innovation ( Caputo et al. ,2 0 1 7 ) that is\npotentially able to reinvent firms’ approach to market analysis. Accordingly, Davenport et al.\n(2012 , p. 43) stated that big data supports firms “to understand their business environments\nat a more granular level, [ ...] creating new products and services, and [ ...] responding\nmore quickly to change as it occurs.” As a result, a new challenge emerges concerning how\nto decode the pattern for companies’ decision-making processes through big data\nanalytics.\n3. Method and data collection\nWith the aim to enrich current debate about the role of big data in companies’ decision-making, a case study approach was set as the research strategy (\nKohlbacher, 2016 ). The\nreasons why this approach was chosen are multi-faceted. On the one hand, the approach\nfollows the recommendations of Yin (2003) , who described the importance of case study\nresearch when a contemporary phenomenon is investigated in its real-world setting, and the\nboundaries between the phenomena itself and the related context are blurred. 
As a matter\nof fact, this method allows for a variety of research methods ( Yin, 2003; Kohlbacher, 2016).\nCase studies allow researchers to combine different data sources (such as interviews, texts\nand observations), as well as using qualitative and quantitative data analysis. Therefore,\nthey can be used to describe a phenomenon and subsequently to develop and test\ntheories ( Darke et al., 1998).\nA widespread procedure is to use case studies in qualitative inquiries ( Stake, 2000;\nKohlbacher, 2016). This is especially relevant in contexts where the “why” and the “how” of\na phenomenon are the focus of an investigation. Consequently, a case study research\nstrategy with a qualitative inquiry seems to be an appropriate approach for an\ninvestigation and the provision of new insights. It is therefore unsurprising that case studies\nare an appropriate and popular way of investigating the implementation and use of\ninformation systems within organisations. This is particularly true in information systems\nresearch and related scientific areas, in which it is quite important to examine and\nunderstand the context of the phenomenon, because often researchers are unclear about\nhow a phenomenon arises or how individuals’ experiences and doings are critical to its\nactions and effects. Furthermore, numerous research approaches demand that with\nregards to the research question the number and topic of the cases must be determined at\nthe outset. Whilst a single case study is applied to gain deep and rich insights, multi-case\nstudies have the advantage of allowing replications (literal, theoretical) and comparisons\nbetween cases ( Darke et al., 1998).\nHere, a topic highly related to information systems research is investigated. Besides\nmanagerial and human factors, the research question also aims to understand the technical\nissues and their related problems. 
Following the recommendations given in the literature, as\ndescribed previously, a multiple case study research strategy was chosen as an appropriate\napproach in line with our research question. As the research focuses on different aspects, a\nsingle case study approach did not seem to be appropriate to best gain the desired insights\nabout the subject. Therefore, multiple cases were investigated by collecting different data from\ndifferent sources and conducting a qualitative analysis ( Yin, 1994 ,2012 ).\nPAGE 2802jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 7]\nConsequently, three different cases were examined. The investigated cases were a\nmanufacturing enterprise, an enterprise from the IT sector and a supplier for IT solutions. It\nis assumed that all the branches are equally affected by the challenges of implementing big\ndata analytics. In addition, the cases highlight and clarify that all sectors are affected by thechallenges of Big Data Analysis. The IT sector is no exception. The investigated enterpriseshave different sizes and turnovers. This circumstance is useful in terms of the generalisabilityof the findings. More details about the companies’ characteristics are reported in\nTable 1 .\nIn all cases, the process to implement the possibility of big data analytics was accompanied\nand supported by at least one of the researchers. As a result, a minimum of one person wasinvolved as an “action researcher” within the organisations (\nWalsham, 1995 ). Subsequently,\nboth the data and the contextual insights gathered are very rich and useful. Every case wascomprehensively investigated and hence a strong understanding of the phenomenon wasachieved (\nDarke et al. , 1998 ). Furthermore, the action researchers accompanied different big\ndata analytics projects within the companies chosen as cases. 
This allowed them to prove andcontrol the generalisability of the insights and findings in different settings (\nDarke et al. ,1 9 9 8 ).\nAs recommended in the literature, different data sources such as observations, interviews andquestionnaires were picked-up and combined (\nDarke et al. ,1 9 9 8 ). An overview about the data\nsources used in this investigation is provided in Table 1 .\nFor the data analysis, the Grounded Theory approach was conducted ( Strauss and Corbin,\n1994 ). This approach is very common and widespread in Information Systems research\n(Aarnikoivu et al. , 2019 ). In the first step, the open coding process was conducted. The data\nwas investigated, and the relevant aspects were tagged with abstract labels. This step isfollowed by the so-called axial coding process. As the second step of the procedure, the axialcoding process examines the relationships between the labels and tries to build networkscontaining relevant aspects. Hence, the identified labels were aggregated and networks werebuilt. In the third step, selective coding was applied, meaning that the networks were subsumedinto categories. In each step, all the team members did the coding process alone and theresults were discussed afterwards.\n4. Results\nThe data analysis revealed that in all cases along the project’s timeline specific patternsoccurred at special points in time. The findings are summarised in\nTable 2 and explained in\nmore detail subsequently.\nPhase (a) : Nearly all enterprises have recognised that the customer data they own is a\nhidden gem. Hence, it is not surprising that companies want to exploit this potential.\nConsequently, organisations have recognised the need for big data analytics to realise thebenefits provided by the data. Often, the top management takes the initiative to createplans for big data analytics projects. They set ambitious goals and objectives thatfrequently consist of a mix of dreams, wishes and reality. 
In many cases, the intended big\nTable 1 Overview of case studies and data gathering process\nEnterprise no. 1 (case 1) Enterprise no. 2 (case 2) Enterprise no. 3 (case 3)\nSector Manufacturing IT IT solution supplier\nCompany size Large Medium Small\nNo. employees >550 >200 63\nTurnover /C24200 Mio e /C24200 Mio e /C245 Mio e\nObservations by accompanying/supportive researcher x x x\nCross-divisional e-mail traffic x x x\nInterviews and expert talks x x x\nSurveys x\nSource: Authors’ elaboration\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2803\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 8]\ndata analytics projects are not realisable for several reasons. Firstly, the company lacks\nconcrete processes, possibilities and outcomes along with the initial vague and imaginative\nassumptions. Hence, big data analytics projects begin similarly and specific requirements\nare often not respected because of the company’s inexperience with such projects.\nSubsequently, wrong estimations in terms of budget and staffing, as well as time and scope\noccur. In addition, some of the most prominent aspects in big data analytics projects are also\nneglected. Furthermore, the availability of data is a crucial factor that is often misjudged.\nOrganisations trust in their databases. However, it is not uncommon for data to be unusable\nbecause of poor data management and questionable data quality. There are also often\nassumptions about data sources that do not, in fact, exist in the reality of the company. In one\nof the cases in this study, an expert in case (1) stated that the management proclaimed that all\nthe needed data is stored and available in their proAlpha ERP system. However, it turned out\nthat this was a false estimation from the management. Even if the data is available, wrong\njudgement can be taken as case (3) revealed. 
The responsible persons in case (3) assumed\nthat they have high quality data about their customers and their behaviour. Although data\nabout the customers was available, it did not meet the requirements. Relevant aspects of\ncustomers’ behaviour were missing and, therefore, the potential for the analysis was quite\nrestricted.\nPhase (b) : Once a project is started, challenges because of human factors, as well as\ntechnical issues arise. On the human side, the challenges are two fold. On the one hand, it\nmight be that the assigned employees did not have the relevant knowledge for conducting\nthe project or cannot be identified. During the project, the management of case (1)\ndiscovered that their internal staff were not able to implement the AI models into their\nsystems. Therefore, they had to find an external service provider who was able to cope with\nthis challenge. On the other hand, missing openness and/or a restricted mindset are a\ncritical human factor too. This often results in staff hiding their knowledge to avoid changes\nthat could lead to more work or that has a negative impact on their job position.\nBesides challenges occurring because of human factors, we also observed technical\naspects that were crucial for the continuation of big data analytics projects. 
On the technicalTable 2 Main results about companies ’approach to big data\nPhase (a.): Before/at the\nbeginning of the project Phase (b.): During the projectPhase (c.): At the\nend/ finalization of\nthe project\nNeed for big data analytics Staff with adequate\nknowledge is missing or\ncannot be foundNot all requirements/\nautomation tasks\ncan be fulfilled\nMix-up of dreams, wishes\nand realityMissing openness/restricted\nmindsetPredictions by the\nalgorithms are not\nalways better than\nthe human ones\nBudget and available staff Data sources (e.g.\ndatabases) do not fitUsability issues\nImplementation/time horizon Identification of the best Big\nData algorithm(s)Time, costs and\neffort was\nunderestimated (run\nof time and budget)\nTrust in databases Must re-design the project\nand re-start\nIT infrastructure is old and notflexible\nData protection rules\nSource: Authors’ elaboration\nPAGE 2804jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 9]\nside, it might be the case that the database does not even contain the expected data or that\nthe data did not fit the requirements, as described previously. In many cases, the missingdata cannot be procured because the IT infrastructure is too old and inflexible. There aremissing interfaces, hence new analytical systems can connect to it and collect the dataneeded for analysis (in cases 1, 2 and 3). Modern standard application interfaces like\nREST-APIs (\nMasse, 2011 ) were not provided, which hindered the seamless collection of\ndata. Furthermore, the implementation of modern big data analytic and data visualisationtools into old systems might be difficult.\nBoth human and technical factors might stop or delay the project. In all cases (1) –(3), it was\nhard to find the correct experts with business and domain specific know-how. 
In cases (1)\nand (3), often the most suitable employees for the task were also not known by themanagement. Sometimes, a step back to the first phase was needed to re-define theresponsibilities and even the technical possibilities. In cases (1) and (3), the project had to\nbe restarted (a). In case (1), adjustments during the project were done. Hence, the aim of\nthe project must be reviewed and re-defined. Another aspect that sometimes occurs is thatthe best algorithm cannot be found. In all cases (1) –(3), there was no generally available\nalgorithm or approach fitting the project’s goal that would deliver a result within theexpected quality range from the very beginning. Furthermore, the available IT infrastructure\nresources (e.g. CPU, RAM, disk) for the analysis hindered the evaluation of different\nalgorithms. For example, (sample) data was split and patterns were reconstructed toevaluate the algorithms. Different algorithms were combined in all cases to accommodateissues such as linear and non-linear behaviour (e.g. linear regression and neuronal\nnetworks) and selected based on different rules (rule-based algorithm selection and\ncombination), as well as patterns that could only be identified during the actual dataanalysis. For instance, after starting the project the responsible persons in case (1) foundout that their systems could not be used to run analytical services. They did not anticipate in\nadvance that the necessary infrastructure capabilities (e.g. CPU/RAM) would be missing.\nPhase (c) : In the final project phase, further patterns were identified within the selected\ncases. Regarding the definition and targets of the big data projects at the beginning of theproject, not all requirements and automation tasks could be fulfilled. This is often a\nconsequence of the fact that the challenges from the two preceding project phases could\nnot be sufficiently taken into account. 
In cases (1) and (3), only a minor set of requirementscould be fulfilled because of the issues in the prior project phases. It was only in case (2)that important requirements during the project could be delivered. Sometimes, theprediction of human experts with years of experience is faster and more accurate\ncompared to the developed systems. This might mean that not all the necessary data is\navailable and the data behaviour patterns may not be recognised by the system. This wasparticularly true of case (1), where the system was not accurate compared to experiencedexperts. The developed big data systems are very complex. Therefore, their usability and\nuser friendliness are severely limited. Experts must configure the systems in advance by\nentering specific parameters. Consequently, the staff must be trained to use the system andto interpret the results with regards to the specific business demands. In all cases (1) –(3),\nthe effort needed to complete the project, in terms of, for instance, time, costs and budget,\nwas underestimated. In cases (1) and (3), the project ran out of time and budget and had to\nbe adjusted. Again, this might be a consequence of the identified patterns in the first twophases of the projects (a) and (b).\n5. Conclusions, implications, limitations and future research\nIn the past few years, big data and big data analytics tools have been presented as the new“miracle” for efficiency, survival and increased performance for any type of organisedentities (\nSchmarzo, 2013 ). These approaches attracted the interest of multiple researchers\nand the investment of multiple companies interested in the possibility of obtaining multiple\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2805\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 10]\nadvantages by simply buying new instruments, software and digital devices. 
Despite the\nsummarised scenarios, the proposed research shows different scenarios in which collectedand analysed data demonstrate that the predictions made by the algorithms do not\nnaturally offer value in isolation. Sometimes, human predictions are even better because\nthey can involve more variable factors and are more intuitive.\nIn such a perspective, the research offers several practical implications because it\nunderlines how automation may not even be possible, and several manual steps are\nneeded as the usability of the tool decreases. Sometimes, users cannot work with the\nsystem because it is hard to handle or because they are not able to interpret the output of\nthe system and relate it to adequate strategical or operational measures. In addition,because of delays and re-definitions the project may run out of time and budget. Thus, the\nexpenses overcome the estimated benefit. Sometimes, projects must even be abandoned.\nFurthermore, issues related, e.g. to the technical foundation of the enterprises, used\nalgorithms and data quality hinder a good implementation and positive value of the system.\nIn the same perspective, the research also underlines several theoretical implications by\nascertaining that to run a big data analytics project successfully it is important to focus on the\nchallenges and anticipate consequences. Therefore, current interpretative paths and managerial\nmodels require radical rethinking to better catch and depict the interconnections that could be\npossible between humans and technology.\nDespite the conceptual and empirical advancements in the knowledge offered by the\nreflections herein, several limitations can be identified with reference to the proposed research\napproach because the results offered by the analyses of the case studies are subjective and\nrelated to the background in which they have been approached and analysed. 
In such a vein,the next steps for the research are required to test to what extent the proposed results and\nobservations can be generalised to different cognitive and geographical domains.\nReferences\nAarnikoivu, M., Nokkala, T., Siekkinen, T., Kuoppala, K. and Pekkola, E. (2019), “Working outside\nacademia? Perceptions of early-career, fixed-term researchers on changing careers”, European Journal\nof Higher Education , Vol. 9, pp. 172-189.\nAlter, S. (2006), The Work System Method: Connecting People, Processes, and IT for Business Results ,\nWork System Method.\nAmendola, C., Calabrese, M. and Caputo, F. (2018), “Fashion companies and customer satisfaction: a\nrelation mediated by information and communication technologies”, Journal of Retailing and Consumer\nServices , Vol. 43, pp. 251-257.\nArdito, L., Scuotto, V., Del Giudice, M. and Petruzzelli, A.M. (2018), “A bibliometric analysis of\nresearch on big data analytics for business and management”, Management Decision ,V o l .5 7\nNo. 8, pp. 1993-2009.\nBaker, O. and Thien, C.N. (2020), “A new approach to use big data tools to substitute unstructured data\nwarehouse”, 2020 IEEE Conference on Big Data and Analytics (ICBDA) , IEEE, pp. 26-31.\nBayer, S., Gimpel, H. and Rau, D. (2020), “IoT-commerce-opportunities for customers through an\naffordance lens”, Electronic Markets , Vol. 31 No. 1, pp. 27-50.\nBlumberg, R. and Atre, S. (2003), “The problem with unstructured data”, Dm Review , Vol. 13, pp. 42-49.\nBoonstra, A. (2003), “Structure and analysis of IS decision-making processes”, European Journal of\nInformation Systems , Vol. 12 No. 3, pp. 195-209.\nBuneman, P., Davidson, S., Fernandez, M. and Suciu, D. (1997), “Adding structure to unstructured data”,\nInternational Conference on Database Theory, Springer, pp. 336-350.\nCaputo, F., Cillo, V., Candelo, E. and Liu, Y. 
(2019a), “Innovating through digital revolution: the role\nof soft skills and big data in increasing firm performance”, Management Decision , Vol. 57 No. 8,\npp. 2032-2051.\nPAGE 2806jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 11]\nCaputo, F., Evangelista, F., Perko, I. and Russo, G. (2017), “The role of big data in value co-creation for\nthe knowledge economy”, in Vrontis, S., Weber, T., Tsoukatos, E. (Eds), Global and National Business\nTheories and Practice: bridging the past with the Future , EuroMed Press, pp. 269-280.\nCaputo, F., Garcia-Perez, A., Cillo, V. and Giacosa, E. (2019b), “A knowledge-based view of people and\ntechnology: directions for a value co-creation-based learning organisation”, Journal of Knowledge\nManagement , Vol. 3 No. 7, pp. 1314-1334.\nCaputo, F., Walletzky, L. and S ˇtep /C19anek, P. (2019c), “Towards a systems thinking based view for the\ngovernance of a smart city’s ecosystem”, Kybernetes , Vol. 48 No. 1, pp. 108-123.\nCastells, M. (1999), The Social Implications of Information and Communication Technologies ,UNESCO ’s\nWorld Social Science Report .\nChaston, I., Badger, B. and Sadler-Smith, E. (2001), “Organizational learning: an empirical assessment of\nprocess in small UK manufacturing firms”, Journal of Small Business Management ,V o l .3 9N o .2 ,\npp. 139-151.\nChen, H., Chiang, R.H. and Storey, V.C. (2012), “Business intelligence and analytics: from big data to big\nimpact”, MIS Quarterly , pp. 1165-1188.\nChinnaswamy, A., Papa, A., Dezi, L. and Mattiacci, A. (2018), “Big data visualisation, geographic\ninformation systems and decision making in healthcare management”, Management Decision , Vol. 57\nNo. 8, pp. 1937-1959.\nCitroen, C.L. (2011), “The role of information in strategic decision-making”, International Journal of\nInformation Management , Vol. 31 No. 6, pp. 
493-501.\nDarke, P., Shanks, G. and Broadbent, M. (1998), “Successfully completing case study research:\ncombining rigour, relevance and pragmatism”, Information Systems Journal , Vol. 8 No. 4, pp. 273-289.\nDaunt, K.L. and Harris, L.C. (2012), “Motives of dysfunctional customer behavior: an empirical study”,\nJournal of Services Marketing , Vol. 26 No. 4, pp. 293-308.\nDavenport, T.H., Barth, P. and Bean, R. (2012), “How big data is different”, MIT Sloan Management\nReview , Vol. 54 No. 1, pp. 43-46.\nDavenport, T., Guszcza, J., Smith, T. and Stiller, B. (2021), Analytics and AI-Driven Enterprises Thrive in\nthe Age of With , Deloitte Insights.\nDel Giudice, M., Scuotto, V., Papa, A., Tarba, S.Y., Bresciani, S. and Warkentin, M. (2021), “A self-tuning\nmodel for smart manufacturing SMEs: effects on digital innovation”, Journal of Product Innovation\nManagement , Vol. 38 No. 1, pp. 68-89.\nDemangeot, C. and Broderick, A.J. (2006), “Exploring the experiential intensity of online shopping\nenvironments”, Qualitative Market Research: An International Journal , Vol. 9 No. 4, pp. 325-351.\nDemchenko, Y., De Laat, C. and Membrey, P. (2014), “Defining architecture components of the big data\necosystem”, 2014 International Conference on Collaboration Technologies and Systems (CTS) ,IEEE ,\npp. 104-112.\nDemchenko, Y., Zhao, Z., Grosso, P., Wibisono, A. and De Laat, C. (2012), “Addressing big data\nchallenges for scientific data infrastructure”, 4th IEEE International Conference on Cloud Computing\nTechnology and Science Proceedings ,IEEE , pp. 614-617.\nDrucker, P.F. (2011), Technology, Management, and Society , Harvard Business Press.\nErevelles, S., Fukawa, N. and Swayne, L. (2016), “Big data consumer analytics and the transformation of\nmarketing”, Journal of Business Research , Vol. 69 No. 2, pp. 
897-904.\nEuropean Commission (2014), “Communication from the commission to the European parliament, the\ncouncil, the european economic and social committee and the committee of the regions”, available at:\nhttps://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:52014DC0442andfrom=EN\nEuropean Commission (2015), “Communication from the commission to the European parliament, thecouncil, the European economic and social committee and the committee of the regions”, A Digital SingleMarket Strategy for Europe, available at:\nhttps://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=\nCELEX:52015DC0192andfrom=EN\nForester, T. (1987), High-Tech Society: The Story of the Information Technology Revolution , MIT Press.\nGandomi, A. and Haider, M. (2015), “Beyond the hype: big data concepts, methods, and analytics”,\nInternational Journal of Information Management , Vol. 35 No. 2, pp. 137-144.\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2807\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 12]\nGoulding, C. (1999), “Consumer research, interpretive paradigms and methodological ambiguities”,\nEuropean Journal of Marketing , Vol. 33 Nos 9/10, pp. 859-873.\nGrant, A.M. and Mayer, D.M. (2009), “Good soldiers and good actors: prosocial and impression\nmanagement motives as interactive predictors of affiliative citizenship behaviors”, Journal of Applied\nPsychology , Vol. 94 No. 4, pp. 900-920.\nGriffin, M., Babin, B.J. and Modianos, D. (2000), “Shopping values of Russian consumers: the impact of\nhabituation in a developing economy”, Journal of Retailing , Vol. 76 No. 1, pp. 33-52.\nGuiot, D. and Roux, D. (2010), “A second-hand shoppers’ motivation scale: antecedents, consequences,\nand implications for retailers”, Journal of Retailing , Vol. 86 No. 4, pp. 355-371.\nGumusluoglu, L. and Ilsev, A. 
(2009), “Transformational leadership, creativity, and organizational\ninnovation”, Journal of Business Research , Vol. 62 No. 4, pp. 461-473.\nHajian, S. and Domingo-Ferrer, J. (2012), “A methodology for direct and indirect discrimination\nprevention in data mining”, IEEE Transactions on Knowledge and Data Engineering ,V o l .2 5N o .7 ,\npp. 1445-1459.\nHarbart, T. (2021), “Tapping the power of unstructured data”, MIT Sloan Management School,\navailable at: https://mitsloan.mit.edu/ideas-made-to-matter/tapping-power-unstructured-data\nHicks, B.J., Culley, S.J. and McMahon, C.A. (2006), “A study of issues relating to information managementacross engineering SMEs”, International Journal of Information Management , Vol. 26 No. 4, pp. 267-289.\nKaur, S., Gupta, S., Singh, S.K. and Perano, M. (2019), “Organizational ambidexterity through global\nstrategic partnerships: a cognitive computing perspective”, Technological Forecasting and Social\nChange , Vol. 145, pp. 43-54.\nKohlbacher, F. (2016), “The use of qualitative content analysis in case study research”, Forum Qualitative\nSozialforschung/Forum: Qualitative Social Research , Vol. 7 No. 1, pp. 1-30.\nManogaran, G., Thota, C. and Lopez, D. (2022), “Human-computer interaction with big data analytics”,\nResearch Anthology on Big Data Analytics, Architectures, and Applications, IGI Global, pp. 1578-1596.\nMarkus, M.L. and Topi, H. (2015), Big Data, Big Decisions for Science, Society, and Business , National\nScience Foundation.\nMasse, M. (2011), REST API Design Rulebook: designing Consistent RESTful Web Service Interfaces ,\nO’Reilly Media.\nMo¨hring, M., Keller, B., Schmidt, R. and Dacko, S. (2020), “Google popular times: towards a better\nunderstanding of tourist customer patronage behavior”, Tourism Review , Vol. 76 No. 3, pp. 553-593.\nMo¨hring, M., Walsh, G., Schmidt, R., Koot, C. and Ha ¨rting, R.C. (2013), “Returns management in\neCommerce”, HMD , Vol. 50 No. 5, pp. 66-75.\nMummalaneni, V. 
(2005), “An empirical investigation of web site characteristics, consumer emotional\nstates and on-line shopping behaviors”, Journal of Business Research , Vol. 58 No. 4, pp. 526-532.\nNutt, P.C. (2008), “Investigating the success of decision making processes”, Journal of Management\nStudies , Vol. 45 No. 2, pp. 425-455.\nPac¸o, A. and Lavrador, T. (2017), “Environmental knowledge and attitudes and behaviours towards\nenergy consumption”, Journal of Environmental Management , Vol. 197, pp. 384-392.\nPapadakis, V.M., Lioukas, S. and Chambers, D. (1998), “Strategic decision-making processes: the role of\nmanagement and context”, Strategic Management Journal , Vol. 19 No. 2, pp. 115-147.\nPauleen, D.J. and Wang, W.Y. (2017), “Does big data mean big knowledge? KM perspectives on big\ndata and analytics”, Journal of Knowledge Management , Vol. 21 No. 1, pp. 1-6.\nPolyakova, A., Loginov, M., Serebrennikova, A. and Thalassinos, E. (2019), “Design of a socio-economic\nprocesses monitoring system based on network analysis and big data”, International Journal of\nEconomics and Business Administration , Vol. 7 No. 1, pp. 30-139.\nRahrovani, Y. and Pinsonneault, A. (2020), “Innovative IT use and innovating with IT: a study of the\nmotivational antecedents of two different types of innovative behaviors”, Journal of the Association for\nInformation Systems , Vol. 21 No. 4, pp. 5-14.\nRaisinghani, M.S. (2000), “Knowledge management: a cognitive perspective on business and\neducation”, American Business Review , Vol. 18 No. 2, pp. 105-131.\nPAGE 2808jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\n\n[Página 13]\nReinsel, D., Gantz, J. and Rydning, J. (2018), The Digitization of the World. From Edge to Core ,A nI D C\nWhite Paper, Seagate.\nRokka, J. and Uusitalo, L. 
(2008), “Preference for green packaging in consumer product choices –do\nconsumer’s care?”, International Journal of Consumer Studies , Vol. 32 No. 5, pp. 516-525.\nSabherwal, R. and King, W.R. (1995), “An empirical taxonomy of the decision-making processes\nconcerning strategic applications of information systems”, Journal of Management Information Systems ,\nVol. 11 No. 4, pp. 177-214.\nSagiroglu, S. and Sinanc, D. (2013), “Big data: a review”, 2013 International Conference on Collaboration\nTechnologies and Systems (CTS) ,IEEE , pp. 42-47.\nSapsford, R. and Jupp, V. (Eds) (1996), Data Collection and Analysis , Sage.\nSchmarzo, B. (2013), Big Data: Understanding How Data Powers Big Business , John Wiley and Sons.\nSchmidt, R. and Mo ¨hring, M. (2013), “Strategic alignment of cloud-based architectures for big data”,\n2013 17th IEEE International Enterprise Distributed Object Computing Conference ,IEEE , pp. 136-143.\nSharma, R., Mithas, S. and Kankanhalli, A. (2014), “Transforming decision-making processes: a research\nagenda for understanding the impact of business analytics on organisations”, European Journal of\nInformation Systems , Vol. 23 No. 4, pp. 433-441.\nSingh, S.K. and Del Giudice, M. (2019), “Big data analytics, dynamic capabilities and firm performance”,\nManagement Decision , Vol. 57 No. 8, pp. 1729-1733.\nStake, R.E. (2000), “Case studies”, in Denzin, N.K and Lincoln, Y.S (Eds), Handbook of Qualitative\nResearch , Sage, pp. 435-453.\nStonebraker, M. (2010), “SQL databases v. NoSQL databases”, Communications of the ACM ,V o l .5 3\nNo. 4, pp. 10-11.\nStrauss, A. and Corbin, J. (1994), “Grounded theory methodology: an overview”, in Denzin, N.K. and\nLincoln, Y.S. (Eds), Handbook of Qualitative Research , Sage, pp. 273-285.\nTurban, E., McLean, E. and Wetherbe, J. (1998), Information Technology for Management Making\nConnections for Strategic Advantage , John Wiley and Sons, Inc.\nWalsham, G. 
(1995), “Interpretive case studies in IS research: nature and method”, European Journal of\nInformation Systems , Vol. 4 No. 2, pp. 74-81.\nYang, Q., Steinfeld, A. and Zimmerman, J. (2019), “Unremarkable AI: fitting intelligent decision support\ninto critical, clinical decision-making processes”, Proceedings of the 2019 CHI Conference on Human\nFactors in Computing Systems , pp. 1-11.\nYin, R.K. (1994), “Designing single-and multiple-case. Improving educational management: through\nresearch and consultancy”, in Bennett, N., Glatter, R. and Levacic, R. (Eds), Improving Educational\nManagement: Through Research and Consultancy , Sage, pp. 135-155.\nYin, R.K. (2003), Case Study Research, Design and Methods , 3rd ed., Sage, Vol. 5.\nYin, R.K. (2012), “Case study methods”, in Cooper, H., Camic, P.M., Long, D.L., Panter, A.T., Rindskopf,\nD. and Sher, K.J. (Eds), APA Handbook of Research Methods in Psychology , Vol. 2.Research Designs:\nQuantitative, Qualitative, Neuropsychological, and Biological , American Psychological Association,\npp. 141-155.\nZakir, J., Seymour, T. and Berg, K. (2015), “Big data analytics”, Issues in Information Systems ,V o l .1 6\nNo. 2, pp. 81-90.\nCorresponding author\nFrancesco Caputo can be contacted at: francesco.caputo2@unina.it\nFor instructions on how to order reprints of this article, please visit our website:\nwww.emeraldgrouppublishing.com/licensing/reprints.htm\nOr contact us for further details: permissions@emeraldinsight.com\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2809\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025",
+ "b4a5ecc8-6a2e-4362-8b31-3d798162b3c6": {
+ "content": "Advancing beyond technicism\nwhen managing big data in\ncompanies ’decision-making\nFrancesco Caputo, Barbara Keller, Michael Möhring, Luca Carrubbo andRainer Schmidt\nAbstract\nPurpose –In recognising the key role of business intelligence and big data analytics in influencing\ncompanies’ decision-making processes, this paper aims to codify the main phases through which\ncompanies can approach, develop and manage big data analytics.\nDesign/methodology/approach –By adopting a research strategy based on case studies, this paper\ndepicts the main phases and challenges that companies ‘‘live’’ through in approaching big data analytics\nas a way to support their decision-making processes. The analysis of case studies has been chosen as\nthe main research method because it offers the possibility for different data sources to describe aphenomenon and subsequently to develop and test theories.\nFindings –This paper provides a possible depiction of the main phases and challenges through which\nthe approach(es) to big data analytics can emerge and evolve over time with reference to companies’decision-making processes.\nResearch limitations/implications –This paper recalls the attention of researchers in defining clear\npatterns through which technology-based approaches should be developed. In its depiction of the main\nphases of the development of big data analytics in companies’ decision-making processes, this paper\nhighlights the possible domains in which to define and renovate approaches to value. The proposed\nconceptual model derives from the adoption of an inductive approach. Despite its validity, it is discussedand questioned through multiple case studies. 
In addition, its generalisability requires further discussion\nand analysis in the light of alternative interpretative perspectives.\nPractical implications –The reflections herein offer practitioners interested in company management\nthe possibility to develop performance measurement tools that can evaluate how each phase can\ncontribute to companies’ value creation processes.\nOriginality/value –This paper contributes to the ongoing debate about the role of digital technologies in\ninfluencing managerial and social models. This paper provides a conceptual model that is able to\nsupport both researchers and practitioners in understanding through which phases big data analytics\ncan be approached and managed to enhance value processes.\nKeywords Big data, Big data analytics, Companies’ decision-making, Smarter management\nPaper type Technical paper\n1. Preliminary reflections\nIn the past few decades, socio-economic configurations have profoundly changed because\nof the increasing use and accessibility of Information and Communication Technologies\n(ICT) in multiple domains of everyday life ( Forester, 1987 ;Turban et al. ,1 9 9 8 ;Drucker, 2011 ;\nCaputo et al. , 2019b ). Consolidated views based on the representation of technologies for\ndata management as “simple instruments” for supporting decision-making activities have\nprogressively shown that they are incapable of explaining ongoing dynamics and trends\n(Caputo et al. , 2019c ). 
Similarly, new interpretative approaches and managerial models are\nstrongly required by researchers and practitioners interested in effectively understanding. Francesco Caputo is based at\nthe Department of Economics, Management and Institutions, University of Naples Federico\nII, Naples, Italy.\nBarbara Keller is based at the Duale Hochschule Baden-Württemberg Stuttgart,\nStuttgart, Germany. Michael Möhring is based at\nthe Department of Informatics –\nHHZ Reutlingen University, Reutlingen, Germany. Luca Carrubbo is based at the Department of Management and Innovation Systems, University of Salerno, Salerno, Italy.\nRainer Schmidt is based at the\nDepartment of Computer Science and Mathematics, University of Applied Sciences Munich, Munich, Germany.\nReceived 8 October 2022\nRevised 26 January 2023\nAccepted 25 February 2023\nCorrigendum: It has come to\nthe attention of the publisher\nthat the article: Caputo, F.,\nKeller, B., Möhring, M.,\nCarrubbo, L. and Schmidt, R. (2023), “Advancing beyond technicism when managing big data in companies’ decision-making”, Journal of Knowledge\nManagement , Vol. ahead-of-\nprint No. ahead-of-print.\nhttps://\ndoi.org/10.1108/JKM-10-2022-\n0794 did not accurately display\nMöhring, M.‘s affiliation.\nOur guidelines state that\naffiliations should be supplied\nin full when the article is submitted.\nThe city corresponding to\nReutlingen University has been\namended from Munich to Reutlingen.\nDOI 10.1108/JKM-10-2022-0794 VOL. 27 NO. 10 2023, pp. 
2797-2809, ©Emerald Publishing Limited, ISSN 1367-3270 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2797\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nwhat the main implications, consequences and effects of the increasing use of ICT in\nbusiness and social dynamics are ( Castells, 1999 ;Markus and Topi, 2015 ).\n[2015] Building upon this widely recognised need, in recent decades, a challenging debate\nhas emerged around the topic of big data analytics as “a way of extracting value from thesehuge volumes of information, and it drives new market opportunities and maximizes\ncustomer retention” (\nZakir et al. ,2 0 1 5 , p. 81). Several contributions have been provided\nwith reference to the multiple advantages that it is possible to obtain for companies from a“new” approach in the collection, coding and management of data related to the multiple\ndimensions of shopping expeditions and evaluations (\nGriffin et al. ,2 0 0 0 ;Mummalaneni,\n2005 ;Demangeot and Broderick, 2006 ;Amendola et al. , 2018 ;Ardito et al. ,2 0 1 8 ). Multiple\nstimuli for reflections have also been provided with reference to the ways in which people,processes and technologies can be combined to improve the quality of companies’ and\nmarkets’ approaches in data collection and use (\nAlter, 2006 ;Singh and Del Giudice, 2019 ).\nAs effectively summarised by Demchenko et al. (2012 , p. 
614), “Data Science is becoming\na new technology driver and requires re-thinking a number of infrastructure, components,solutions and processes to address the following general challenges: Exponential growth of\ndata volume produced by different research instruments and/or collected from sensors;\nNeed to consolidate e-Infrastructure as [a] persistent research platform to ensure researchcontinuity and oration, deliver/offer persistent services, with [an] adequate governancemodel.” According to the authors’ reflections, the challenging domain about big data should\nmainly refer to the infrastructure and processes required for ensuring the effective collection\nand organisation of a huge volume of data.\nDespite the relevance of the aforementioned dimensions, it only represents a “small” part of\nthe multiple reflections that seem to require the ongoing transitions towards a knowledge\nera based on technology infrastructure. Several relevant elements related to human\napproaches to big data, the consequences of big data analytics in companies’ decision-making processes and the antecedents capable of addressing the ongoing digital transition(\nCaputo et al. , 2019a ;Chinnaswamy et al. ,2 0 1 8 ), among others, seem to be vastly\nunderestimated. Accordingly, the paper proposes extending current perspectives in the\nstudy of big data analytics by focusing attention on the intriguing domain of big dataanalytics, specifically “the extraction of hidden sight about consumer behaviour from bigdata and the exploitation of that insight through advantageous interpretation” (\nErevelles\net al. ,2 0 1 6 , p. 897). Thanks to the adoption of a research strategy based on case studies,\nthe paper aims to depict the main phases that companies face in the process of reshapingdecision-making processes through big data analytics. 
The analysis of case studies has\nbeen chosen as the main research method because it offers the possibility for different data\nsources to describe a phenomenon and subsequently to develop and test theories.\nThe paper is structured as follows. In Section 2, the theoretical background will be\npresented by focusing attention on smart management and on the role of big data analytics\nin companies’ decision-making processes as relevant domains with reference to which\nproposed reflections have been developed. In Section 3, the method and data collection ofthe proposed research will be reported, whilst in Section 4, the results of the proposedresearch will be summarised to enrich the current debate about the role of big data\nanalytics in reshaping companies’ decision-making processes. Finally, in Section 5, the\nstudy’s preliminary conclusions, main limitations, implications and possible future directionswill be presented.\n2. Theoretical background\nThe way in which organisations apply data analysis has changed over time ( Chen et al. ,\n2012 ). In recent years, different methods have been developed that depend on the different\ndata sources and related data structures.\nPAGE 2798jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nIn general, different data sources with structured and/or unstructured data can be part of\nbig data projects ( Gandomi and Haider, 2015 ). In the past, enterprises were only able to\nanalyse structured datasets like customer order data coming from, for example, CRM orERP systems (\nChen et al. , 2012 ). The data used for analyses mainly consisted of numbers\nor categorial variables, for example. The way of collecting, storing and analysing data was\nless complex in comparison to more recent data sources containing unstructured data(\nBuneman et al. 
, 1997 ;Blumberg and Atre, 2003 ;Baker and Thien, 2020 ;Del Giudice et al. ,\n2021 ). Today, however, up to 90% of the collected data is unstructured data like texts,\nimages, audio and video ( Harbart, 2021 ). The analysis of unstructured data is currently\nchallenging organisations because of its unsuitability for use in conventional data models(\nHarbart, 2021 ). The use of unstructured data together with structured data is manifold. For\ninstance, it can be used to improve the quality and the possibilities of prediction within big\ndata analytics ( Davenport et al. , 2021 ). Nevertheless, the more data types are included in\nanalytical projects, the more different methods must be used. Today, more and more IoT-related data sources like connected home appliances (\nBayer et al. , 2020 ) or services like\nGoogle Popular times ( Mo¨hring et al. ,2 0 2 0 ) can be used to predict and better understand\ncustomer behaviour. These new data sources must be integrated within the analytical\nlandscape to be used in related analysis. Another interesting use case that highlights thechallenges of the benefits of big data analytics is product returns in e-commerce. This fieldis even more important because it meets both customer behaviour and the sustainability\nconcept, as well as helping to easily understand the facets appearing in big data analysis.\nFor instance, if an organisation wants to use online customer reviews (unstructured textualdata) to predict the product returns probability (\nSchmidt and Mo ¨hring, 2013 ;Mo¨hring et al. ,\n2013 ), past customer order data from the CRM and ERP system (structured data) as well as\nimages (unstructured image data) from offered goods should also be integrated into the\nanalysis to enhance the quality of the prediction. Therefore, they must apply differentmethods like text mining for textual data, image pattern recognition for images and\ntraditional data mining techniques like regression or correlation analysis. 
In turn, this means\nthat different results, various key figures and quality criteria must be aggregated andharmonised within one comprehensive result (\nKaur et al. , 2019 ).\nFurthermore, the data must be stored in different locations like relational databases for the\norder data and/or within NoSql databases ( Stonebraker, 2010 ) like document-based\ndatabases for textual data. In sum, all these requirements will increase the complexity of big\ndata analytics projects and generate challenges for organisations running an analytical\nproject. In line with the identified methodological complexity and storing issues, thecomputational complexity also increases. The more variables are included in analyticalapproaches, the more steps for information processing and result calculation are\nnecessary. Therefore, organisations that are considering applying big data analytics must\nexplore the option of scalable public cloud computing services at major sites like AmazonAWS, Microsoft Azure and Google Cloud to capture the limitations of traditional non-scalable systems (\nSchmidt and Mo ¨hring, 2013 ).\n2.1 Challenges and dynamics of smart management\nNowadays, the dynamics in decision-making in all contexts are increasingly guided and\nconditioned by the reception, filtering, processing and use of data ( Raisinghani, 2000 ). The\nevolution of new technologies favours the development of virtuous processes [thanks to bigdata analytics techniques, data mining, machine learning, artificial intelligence (AI), etc.]\nthat support decision-making processes (\nNutt, 2008 ;Yang et al. , 2019 ). The growing\nuncertainty in all application areas accentuates the importance of the way in whichdecisions are made, especially if they involve significant consequences for the community.Decision making is a multidisciplinary topic that lends itself to different levels of analysis\nVOL. 27 NO. 
10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2799\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nwhen we focus on the various elements (including technological ones) that condition or\nfacilitate it ( Papadakis et al. ,1 9 9 8 ).\nDecision-making processes are increasingly data-driven. Therefore, the decisions are more\n“informed” because the exchange of information is rapid (often in real-time); hence, it canbe precise, punctual, efficient and valid. From the electronic medical record (now fully\noperational) to the development of information systems to new communication protocols, it\nis possible to record a continuous flow of data, information (and the contained in it) and the\ninputs to be filtered, processed, used and managed in a timely manner (\nSharma et al. ,\n2014 ). The risk of “data-deluge” and the difficulty of having useful elements is very high,\nwhile the possibility of making quick, accurate, thoughtful decisions becomes more and\nmore necessary, indeed fundamental ( Sabherwal and King, 1995 ;Citroen, 2011 ). In this\nsense, the evolution of decision-support-systems (DSS) assumes increasing importance inmany critical “moments”, both for descriptive-analytics (e.g. diagnostics, evaluations and\nmonitoring), as well as in the follow-up analytics in the operational phases and even for\nforecasting possible choices in the future and related reasons through predictive analyticsand prescriptive analytics (\nBoonstra, 2003 ).\nIn general, information sharing with shared databases, data-storage, data extraction and data\nprocessing favours the design of a more functional, versatile, scalable, context-friendly service\nprovision, where the smart management can make a difference thus deserves to be furtherexplored. For this, it becomes important to study the main characteristics of data that can be\nacquired. 
Here, the so-called “10V[s] of big-data” (Volume, Velocity, Variety, Veracity, Value,\nValidity, Variability, Venue, Vocabulary and Vagueness) are often taken into consideration tounderstand how new knowledge is generated and, consequently, how much decision-making\nprocesses are affected; particularly with reference to the possible advantages of meta dating,\ndata modelling, architecture and data integration (\nManogaran et al. ,2 0 2 2 ).\nSimilarly, the most frequently used methods are studied to improve decision-making from\ndata management’s point of view. Typical topics of interest here are cloud computing for\ninformation sharing, artificial intelligence for the data interpretation available and the\ngeneration of new ones like data mining and machine learning. The aim here is to betterunderstand how the information flow works, what criticalities it presents, how it feeds the\nactivation and management of known protocols, how it integrates the various data-sources\nand how it supports the management of queries (\nHicks et al. ,2 0 0 6 ).\nAll this effectively integrates decision-making techniques (cost benefits, grid analysis, paired\ncomparison, compensatory strategies, etc.), with particular reference to conditions of uncertainty\nbecause of, for example, systematic errors, cognitive biases, risk situations, external distortions,\ninformation asymmetries, misalignments, internal friction, misunderstandings, technical oradministrative misunderstandings, legal aspects, technological crashes or even weak signals\nescaping, somatic markers and negative contingencies.\nThese issues are so fundamental and interesting that in the period between 2021 and 2027,\nEuropean investments will be geared towards building a smarter Europe throughinnovation, digitalisation, economic transformation and support for small- and medium-\nsized enterprises. 
EIT Digital has launched the 2022 call to promote entrepreneurship and\neducation for the construction of a strong digital Europe and contribute to the developmentof digital technology, digital industry, digital cities, digital wellbeing and digital finance.\nSince 2014, the European Commission has spoken out in favour of a thriving data-driven\neconomy (\nEuropean Commission, 2014 ); in 2015, it discussed a strategy for the digital\nsingle market in Europe ( European Commission, 2015 ). In 2018, the International Data\nCorporation estimated an increase of 16 trillion gigabytes of data, with an annual growth\nrate of 236% in terms of data generation to date; they linked this to the fact that decisions\nbased on knowledge generated by big data can lead to increased productivity andcompetitiveness and GDP (equal to 1.9% by 2020) (\nReinsel et al. , 2018 ).\nPAGE 2800jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nToday, the evolving trend of Big Data Analyses is an integral part of a new digital market.\nAccording to the European Commission, it guarantees the development of innovative and\ncompetitive business models. However, while having to comply with the EU data protectionframework, big data can involve significant risks and challenges, especially in fundamentalrights like privacy and data protection. 
More recently, the European Parliament discussedthe role of the data-based economy in the strategy for the digital union against the backgroundof all stakeholders and their daily life situations, such as consumers (ease of use, efficiency\nand savings), businesses (industry 4.0) and public administration (e-government), housing\n(smart cities), science, medicine (Mhealth), disaster response capacity and the fight againstcrime, etc.\n2.2 Big data in companies’ decision-making processes\nScientists and researchers have long since faced the challenges of data management,focusing their attention on possible ways to collect data both directly and indirectly(\nSapsford and Jupp, 1996 ;Hajian and Domingo-Ferrer, 2012 ). Several experiments have\nbeen conducted aiming to define the processes and protocols that enhance the\neffectiveness of data collection as a relevant way to extend consolidated knowledge aboutthe reasons, antecedents and motivations behind actors’ behaviours and decisions inmultiple domains (\nGrant and Mayer, 2009 ;Guiot and Roux, 2010 ;Daunt and Harris, 2012 ;\nRahrovani and Pinsonneault, 2020 ). Along this line, studies focusing on companies’\ndecision-making have also been developed and multiple approaches for collecting andanalysing data have been investigated (\nGoulding, 1999 ;Rokka and Uusitalo, 2008 ;Pac¸o\nand Lavrador, 2017 ).\nNowadays, all these approaches and contributions seem to be outmoded against the\nbackground of the disruptive role of big data analytics in the data and knowledge managementprocesses (\nPauleen and Wang, 2017 ). Today, big data infrastructure supports the handling of\ndata operations by facilitating the source’s integration and collaboration in real time with highstandards for control and data safety (\nSagiroglu and Sinanc, 2013 ).\nDemchenko et al. (2014 , p. 
105) reports “the Big Data definition as having the following 5V\nproperties: Volume, Velocity, Variety that constitute native/original Big Data properties, and Value and Veracity as acquired as a result of data[’s] initial classification and processing in\nthe context of a specific process or model.” These properties effectively summarise the\nrelevant contributions that big data can provide the management of a high volume of data in real time without “damaging” the granularity of information to ensure a realistic representation of the phenomenon (\nPolyakova et al., 2019 ).\nAccording to Erevelles et al. (2016) , the properties of big data seem to provide a valuable\nsolution for organisations striving to find an answer to environmental and social changes through predictive approaches about market trends. More comprehensively, big data offers organisations the opportunities to increase:\n■ their dynamic capabilities – their “ability to respond to change incorporates skills and\nknowledge embedded within the organization to alter existing resources and create new value” ( Erevelles et al. , 2016 , pp. 898–899); and\n■ adaptive capabilities – as capabilities that do not derive “from a specific change in\norganizational structure but from the overall ability to capture consumer activities and extract hidden in-sights” ( Erevelles et al. , 2016 , p. 899).\nRecognising the disruptive role of big data in reinventing firms’ market approaches, it is\npossible to underline its contribution in supporting enterprises in innovating their relationships with the market by focusing on the “implementation of creative ideas” (\nGumusluoglu and Ilsev, 2009 , p. 61). From this perspective, big data analytics can be seen\nas a valuable approach that supports firms to enforce their relationship by focusing on the\nVOL. 27 NO. 
10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2801\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\ndefinition of the innovation management path based on their “ability to effectively acquire\nand exploit new information” ( Chaston et al. ,2 0 0 1 , p. 147). Data acquisition and exploitation\nbecame the bridge with the capacity to link innovation management, information management\nand market analysis under the common umbrella of big data analytics; this offers the\nopportunity to understand current interest in developing an effective model for information\nmanagement, allowing firms to better understand (and predict) market trends and\nexpectations based on big data analytics ( Erevelles et al. ,2 0 1 6 ).\nIn a nutshell, big data can be considered a disruptive innovation ( Caputo et al. ,2 0 1 7 ) that is\npotentially able to reinvent firms’ approach to market analysis. Accordingly, Davenport et al.\n(2012 , p. 43) stated that big data supports firms “to understand their business environments\nat a more granular level, [ ...] creating new products and services, and [ ...] responding\nmore quickly to change as it occurs.” As a result, a new challenge emerges concerning how\nto decode the pattern for companies’ decision-making processes through big data\nanalytics.\n3. Method and data collection\nWith the aim to enrich current debate about the role of big data in companies’ decision-making, a case study approach was set as the research strategy (\nKohlbacher, 2016 ). The\nreasons why this approach was chosen are multi-faceted. On the one hand, the approach\nfollows the recommendations of Yin (2003) , who described the importance of case study\nresearch when a contemporary phenomenon is investigated in its real-world setting, and the\nboundaries between the phenomena itself and the related context are blurred. 
As a matter\nof fact, this method allows for a variety of research methods ( Yin, 2003 ; Kohlbacher, 2016 ).\nCase studies allow researchers to combine different data sources (such as interviews, texts\nand observations), as well as using qualitative and quantitative data analysis. Therefore,\nthey can be used to describe a phenomenon and subsequently to develop and test\ntheories ( Darke et al., 1998 ).\nA widespread procedure is to use case studies in qualitative inquiries ( Stake, 2000 ;\nKohlbacher, 2016 ). This is especially relevant in contexts where the “why” and the “how” of\na phenomenon are the focus of an investigation. Consequently, a case study research\nstrategy with a qualitative inquiry seems to be an appropriate approach for an\ninvestigation and the provision of new insights. It is therefore unsurprising that case studies\nare an appropriate and popular way of investigating the implementation and use of\ninformation systems within organisations. This is particularly true in information systems\nresearch and related scientific areas, in which it is quite important to examine and\nunderstand the context of the phenomenon, because often researchers are unclear about\nhow a phenomenon arises or how individuals’ experiences and doings are critical to its\nactions and effects. Furthermore, numerous research approaches demand that with\nregard to the research question the number and topic of the cases must be determined at\nthe outset. Whilst a single case study is applied to gain deep and rich insights, multi-case\nstudies have the advantage of allowing replications (literal, theoretical) and comparisons\nbetween cases ( Darke et al., 1998 ).\nHere, a topic highly related to information systems research is investigated. Besides\nmanagerial and human factors, the research question also aims to understand the technical\nissues and their related problems. 
Following the recommendations given in the literature, as\ndescribed previously, a multiple case study research strategy was chosen as an appropriate\napproach in line with our research question. As the research focuses on different aspects, a\nsingle case study approach did not seem to be appropriate to best gain the desired insights\nabout the subject. Therefore, multiple cases were investigated by collecting different data from\ndifferent sources and conducting a qualitative analysis ( Yin, 1994 ,2012 ).\nPAGE 2802jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nConsequently, three different cases were examined. The investigated cases were a\nmanufacturing enterprise, an enterprise from the IT sector and a supplier for IT solutions. It\nis assumed that all the branches are equally affected by the challenges of implementing big\ndata analytics. In addition, the cases highlight and clarify that all sectors are affected by thechallenges of Big Data Analysis. The IT sector is no exception. The investigated enterpriseshave different sizes and turnovers. This circumstance is useful in terms of the generalisabilityof the findings. More details about the companies’ characteristics are reported in\nTable 1 .\nIn all cases, the process to implement the possibility of big data analytics was accompanied\nand supported by at least one of the researchers. As a result, a minimum of one person wasinvolved as an “action researcher” within the organisations (\nWalsham, 1995 ). Subsequently,\nboth the data and the contextual insights gathered are very rich and useful. Every case wascomprehensively investigated and hence a strong understanding of the phenomenon wasachieved (\nDarke et al. , 1998 ). Furthermore, the action researchers accompanied different big\ndata analytics projects within the companies chosen as cases. 
This allowed them to prove andcontrol the generalisability of the insights and findings in different settings (\nDarke et al. ,1 9 9 8 ).\nAs recommended in the literature, different data sources such as observations, interviews andquestionnaires were picked-up and combined (\nDarke et al. ,1 9 9 8 ). An overview about the data\nsources used in this investigation is provided in Table 1 .\nFor the data analysis, the Grounded Theory approach was conducted ( Strauss and Corbin,\n1994 ). This approach is very common and widespread in Information Systems research\n(Aarnikoivu et al. , 2019 ). In the first step, the open coding process was conducted. The data\nwas investigated, and the relevant aspects were tagged with abstract labels. This step isfollowed by the so-called axial coding process. As the second step of the procedure, the axialcoding process examines the relationships between the labels and tries to build networkscontaining relevant aspects. Hence, the identified labels were aggregated and networks werebuilt. In the third step, selective coding was applied, meaning that the networks were subsumedinto categories. In each step, all the team members did the coding process alone and theresults were discussed afterwards.\n4. Results\nThe data analysis revealed that in all cases along the project’s timeline specific patternsoccurred at special points in time. The findings are summarised in\nTable 2 and explained in\nmore detail subsequently.\nPhase (a) : Nearly all enterprises have recognised that the customer data they own is a\nhidden gem. Hence, it is not surprising that companies want to exploit this potential.\nConsequently, organisations have recognised the need for big data analytics to realise thebenefits provided by the data. Often, the top management takes the initiative to createplans for big data analytics projects. They set ambitious goals and objectives thatfrequently consist of a mix of dreams, wishes and reality. 
In many cases, the intended big\nTable 1 Overview of case studies and data gathering process\nEnterprise no. 1 (case 1) Enterprise no. 2 (case 2) Enterprise no. 3 (case 3)\nSector Manufacturing IT IT solution supplier\nCompany size Large Medium Small\nNo. employees >550 >200 63\nTurnover /C24200 Mio e /C24200 Mio e /C245 Mio e\nObservations by accompanying/supportive researcher x x x\nCross-divisional e-mail traffic x x x\nInterviews and expert talks x x x\nSurveys x\nSource: Authors’ elaboration\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2803\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\ndata analytics projects are not realisable for several reasons. Firstly, the company lacks\nconcrete processes, possibilities and outcomes along with the initial vague and imaginative\nassumptions. Hence, big data analytics projects begin similarly and specific requirements\nare often not respected because of the company’s inexperience with such projects.\nSubsequently, wrong estimations in terms of budget and staffing, as well as time and scope\noccur. In addition, some of the most prominent aspects in big data analytics projects are also\nneglected. Furthermore, the availability of data is a crucial factor that is often misjudged.\nOrganisations trust in their databases. However, it is not uncommon for data to be unusable\nbecause of poor data management and questionable data quality. There are also often\nassumptions about data sources that do not, in fact, exist in the reality of the company. In one\nof the cases in this study, an expert in case (1) stated that the management proclaimed that all\nthe needed data is stored and available in their proAlpha ERP system. However, it turned out\nthat this was a false estimation from the management. Even if the data is available, wrong\njudgement can be taken as case (3) revealed. 
The responsible persons in case (3) assumed\nthat they have high-quality data about their customers and their behaviour. Although data\nabout the customers was available, it did not meet the requirements. Relevant aspects of\ncustomers’ behaviour were missing and, therefore, the potential for the analysis was quite\nrestricted.\nPhase (b) : Once a project is started, challenges because of human factors, as well as\ntechnical issues arise. On the human side, the challenges are twofold. On the one hand, it\nmight be that the assigned employees did not have the relevant knowledge for conducting\nthe project or cannot be identified. During the project, the management of case (1)\ndiscovered that their internal staff were not able to implement the AI models into their\nsystems. Therefore, they had to find an external service provider who was able to cope with\nthis challenge. On the other hand, missing openness and/or a restricted mindset are a\ncritical human factor too. This often results in staff hiding their knowledge to avoid changes\nthat could lead to more work or that have a negative impact on their job position.\nBesides challenges occurring because of human factors, we also observed technical\naspects that were crucial for the continuation of big data analytics projects. 
On the technical\nTable 2 Main results about companies’ approach to big data\nPhase (a.): Before/at the\nbeginning of the project | Phase (b.): During the project | Phase (c.): At the\nend/finalization of\nthe project\nNeed for big data analytics | Staff with adequate\nknowledge is missing or\ncannot be found | Not all requirements/\nautomation tasks\ncan be fulfilled\nMix-up of dreams, wishes\nand reality | Missing openness/restricted\nmindset | Predictions by the\nalgorithms are not\nalways better than\nthe human ones\nBudget and available staff | Data sources (e.g.\ndatabases) do not fit | Usability issues\nImplementation/time horizon | Identification of the best Big\nData algorithm(s) | Time, costs and\neffort was\nunderestimated (run\nof time and budget)\nTrust in databases | Must re-design the project\nand re-start\nIT infrastructure is old and not flexible\nData protection rules\nSource: Authors’ elaboration\nPAGE 2804 j JOURNAL OF KNOWLEDGE MANAGEMENT j VOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nside, it might be the case that the database does not even contain the expected data or that\nthe data did not fit the requirements, as described previously. In many cases, the missing data cannot be procured because the IT infrastructure is too old and inflexible. There are missing interfaces, hence new analytical systems cannot connect to it and collect the data needed for analysis (in cases 1, 2 and 3). Modern standard application interfaces like\nREST-APIs (\nMasse, 2011 ) were not provided, which hindered the seamless collection of\ndata. Furthermore, the implementation of modern big data analytic and data visualisation tools into old systems might be difficult.\nBoth human and technical factors might stop or delay the project. In all cases (1)–(3), it was\nhard to find the correct experts with business and domain-specific know-how. 
In cases (1)\nand (3), often the most suitable employees for the task were also not known by themanagement. Sometimes, a step back to the first phase was needed to re-define theresponsibilities and even the technical possibilities. In cases (1) and (3), the project had to\nbe restarted (a). In case (1), adjustments during the project were done. Hence, the aim of\nthe project must be reviewed and re-defined. Another aspect that sometimes occurs is thatthe best algorithm cannot be found. In all cases (1) –(3), there was no generally available\nalgorithm or approach fitting the project’s goal that would deliver a result within theexpected quality range from the very beginning. Furthermore, the available IT infrastructure\nresources (e.g. CPU, RAM, disk) for the analysis hindered the evaluation of different\nalgorithms. For example, (sample) data was split and patterns were reconstructed toevaluate the algorithms. Different algorithms were combined in all cases to accommodateissues such as linear and non-linear behaviour (e.g. linear regression and neuronal\nnetworks) and selected based on different rules (rule-based algorithm selection and\ncombination), as well as patterns that could only be identified during the actual dataanalysis. For instance, after starting the project the responsible persons in case (1) foundout that their systems could not be used to run analytical services. They did not anticipate in\nadvance that the necessary infrastructure capabilities (e.g. CPU/RAM) would be missing.\nPhase (c) : In the final project phase, further patterns were identified within the selected\ncases. Regarding the definition and targets of the big data projects at the beginning of theproject, not all requirements and automation tasks could be fulfilled. This is often a\nconsequence of the fact that the challenges from the two preceding project phases could\nnot be sufficiently taken into account. 
In cases (1) and (3), only a minor set of requirementscould be fulfilled because of the issues in the prior project phases. It was only in case (2)that important requirements during the project could be delivered. Sometimes, theprediction of human experts with years of experience is faster and more accurate\ncompared to the developed systems. This might mean that not all the necessary data is\navailable and the data behaviour patterns may not be recognised by the system. This wasparticularly true of case (1), where the system was not accurate compared to experiencedexperts. The developed big data systems are very complex. Therefore, their usability and\nuser friendliness are severely limited. Experts must configure the systems in advance by\nentering specific parameters. Consequently, the staff must be trained to use the system andto interpret the results with regards to the specific business demands. In all cases (1) –(3),\nthe effort needed to complete the project, in terms of, for instance, time, costs and budget,\nwas underestimated. In cases (1) and (3), the project ran out of time and budget and had to\nbe adjusted. Again, this might be a consequence of the identified patterns in the first twophases of the projects (a) and (b).\n5. Conclusions, implications, limitations and future research\nIn the past few years, big data and big data analytics tools have been presented as the new“miracle” for efficiency, survival and increased performance for any type of organisedentities (\nSchmarzo, 2013 ). These approaches attracted the interest of multiple researchers\nand the investment of multiple companies interested in the possibility of obtaining multiple\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2805\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nadvantages by simply buying new instruments, software and digital devices. 
Despite the\nsummarised scenarios, the proposed research shows different scenarios in which collectedand analysed data demonstrate that the predictions made by the algorithms do not\nnaturally offer value in isolation. Sometimes, human predictions are even better because\nthey can involve more variable factors and are more intuitive.\nIn such a perspective, the research offers several practical implications because it\nunderlines how automation may not even be possible, and several manual steps are\nneeded as the usability of the tool decreases. Sometimes, users cannot work with the\nsystem because it is hard to handle or because they are not able to interpret the output of\nthe system and relate it to adequate strategical or operational measures. In addition,because of delays and re-definitions the project may run out of time and budget. Thus, the\nexpenses overcome the estimated benefit. Sometimes, projects must even be abandoned.\nFurthermore, issues related, e.g. to the technical foundation of the enterprises, used\nalgorithms and data quality hinder a good implementation and positive value of the system.\nIn the same perspective, the research also underlines several theoretical implications by\nascertaining that to run a big data analytics project successfully it is important to focus on the\nchallenges and anticipate consequences. Therefore, current interpretative paths and managerial\nmodels require radical rethinking to better catch and depict the interconnections that could be\npossible between humans and technology.\nDespite the conceptual and empirical advancements in the knowledge offered by the\nreflections herein, several limitations can be identified with reference to the proposed research\napproach because the results offered by the analyses of the case studies are subjective and\nrelated to the background in which they have been approached and analysed. 
In such a vein,the next steps for the research are required to test to what extent the proposed results and\nobservations can be generalised to different cognitive and geographical domains.\nReferences\nAarnikoivu, M., Nokkala, T., Siekkinen, T., Kuoppala, K. and Pekkola, E. (2019), “Working outside\nacademia? Perceptions of early-career, fixed-term researchers on changing careers”, European Journal\nof Higher Education , Vol. 9, pp. 172-189.\nAlter, S. (2006), The Work System Method: Connecting People, Processes, and IT for Business Results ,\nWork System Method.\nAmendola, C., Calabrese, M. and Caputo, F. (2018), “Fashion companies and customer satisfaction: a\nrelation mediated by information and communication technologies”, Journal of Retailing and Consumer\nServices , Vol. 43, pp. 251-257.\nArdito, L., Scuotto, V., Del Giudice, M. and Petruzzelli, A.M. (2018), “A bibliometric analysis of\nresearch on big data analytics for business and management”, Management Decision ,V o l .5 7\nNo. 8, pp. 1993-2009.\nBaker, O. and Thien, C.N. (2020), “A new approach to use big data tools to substitute unstructured data\nwarehouse”, 2020 IEEE Conference on Big Data and Analytics (ICBDA) , IEEE, pp. 26-31.\nBayer, S., Gimpel, H. and Rau, D. (2020), “IoT-commerce-opportunities for customers through an\naffordance lens”, Electronic Markets , Vol. 31 No. 1, pp. 27-50.\nBlumberg, R. and Atre, S. (2003), “The problem with unstructured data”, Dm Review , Vol. 13, pp. 42-49.\nBoonstra, A. (2003), “Structure and analysis of IS decision-making processes”, European Journal of\nInformation Systems , Vol. 12 No. 3, pp. 195-209.\nBuneman, P., Davidson, S., Fernandez, M. and Suciu, D. (1997), “Adding structure to unstructured data”,\nInternational Conference on Database Theory, Springer, pp. 336-350.\nCaputo, F., Cillo, V., Candelo, E. and Liu, Y. 
(2019a), “Innovating through digital revolution: the role\nof soft skills and big data in increasing firm performance”, Management Decision , Vol. 57 No. 8,\npp. 2032-2051.\nPAGE 2806jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nCaputo, F., Evangelista, F., Perko, I. and Russo, G. (2017), “The role of big data in value co-creation for\nthe knowledge economy”, in Vrontis, S., Weber, T., Tsoukatos, E. (Eds), Global and National Business\nTheories and Practice: bridging the past with the Future , EuroMed Press, pp. 269-280.\nCaputo, F., Garcia-Perez, A., Cillo, V. and Giacosa, E. (2019b), “A knowledge-based view of people and\ntechnology: directions for a value co-creation-based learning organisation”, Journal of Knowledge\nManagement , Vol. 3 No. 7, pp. 1314-1334.\nCaputo, F., Walletzky, L. and S ˇtep /C19anek, P. (2019c), “Towards a systems thinking based view for the\ngovernance of a smart city’s ecosystem”, Kybernetes , Vol. 48 No. 1, pp. 108-123.\nCastells, M. (1999), The Social Implications of Information and Communication Technologies ,UNESCO ’s\nWorld Social Science Report .\nChaston, I., Badger, B. and Sadler-Smith, E. (2001), “Organizational learning: an empirical assessment of\nprocess in small UK manufacturing firms”, Journal of Small Business Management ,V o l .3 9N o .2 ,\npp. 139-151.\nChen, H., Chiang, R.H. and Storey, V.C. (2012), “Business intelligence and analytics: from big data to big\nimpact”, MIS Quarterly , pp. 1165-1188.\nChinnaswamy, A., Papa, A., Dezi, L. and Mattiacci, A. (2018), “Big data visualisation, geographic\ninformation systems and decision making in healthcare management”, Management Decision , Vol. 57\nNo. 8, pp. 1937-1959.\nCitroen, C.L. (2011), “The role of information in strategic decision-making”, International Journal of\nInformation Management , Vol. 31 No. 6, pp. 
493-501.\nDarke, P., Shanks, G. and Broadbent, M. (1998), “Successfully completing case study research:\ncombining rigour, relevance and pragmatism”, Information Systems Journal , Vol. 8 No. 4, pp. 273-289.\nDaunt, K.L. and Harris, L.C. (2012), “Motives of dysfunctional customer behavior: an empirical study”,\nJournal of Services Marketing , Vol. 26 No. 4, pp. 293-308.\nDavenport, T.H., Barth, P. and Bean, R. (2012), “How big data is different”, MIT Sloan Management\nReview , Vol. 54 No. 1, pp. 43-46.\nDavenport, T., Guszcza, J., Smith, T. and Stiller, B. (2021), Analytics and AI-Driven Enterprises Thrive in\nthe Age of With , Deloitte Insights.\nDel Giudice, M., Scuotto, V., Papa, A., Tarba, S.Y., Bresciani, S. and Warkentin, M. (2021), “A self-tuning\nmodel for smart manufacturing SMEs: effects on digital innovation”, Journal of Product Innovation\nManagement , Vol. 38 No. 1, pp. 68-89.\nDemangeot, C. and Broderick, A.J. (2006), “Exploring the experiential intensity of online shopping\nenvironments”, Qualitative Market Research: An International Journal , Vol. 9 No. 4, pp. 325-351.\nDemchenko, Y., De Laat, C. and Membrey, P. (2014), “Defining architecture components of the big data\necosystem”, 2014 International Conference on Collaboration Technologies and Systems (CTS) ,IEEE ,\npp. 104-112.\nDemchenko, Y., Zhao, Z., Grosso, P., Wibisono, A. and De Laat, C. (2012), “Addressing big data\nchallenges for scientific data infrastructure”, 4th IEEE International Conference on Cloud Computing\nTechnology and Science Proceedings ,IEEE , pp. 614-617.\nDrucker, P.F. (2011), Technology, Management, and Society , Harvard Business Press.\nErevelles, S., Fukawa, N. and Swayne, L. (2016), “Big data consumer analytics and the transformation of\nmarketing”, Journal of Business Research , Vol. 69 No. 2, pp. 
897-904.\nEuropean Commission (2014), “Communication from the commission to the European parliament, the\ncouncil, the european economic and social committee and the committee of the regions”, available at:\nhttps://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:52014DC0442andfrom=EN\nEuropean Commission (2015), “Communication from the commission to the European parliament, thecouncil, the European economic and social committee and the committee of the regions”, A Digital SingleMarket Strategy for Europe, available at:\nhttps://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=\nCELEX:52015DC0192andfrom=EN\nForester, T. (1987), High-Tech Society: The Story of the Information Technology Revolution , MIT Press.\nGandomi, A. and Haider, M. (2015), “Beyond the hype: big data concepts, methods, and analytics”,\nInternational Journal of Information Management , Vol. 35 No. 2, pp. 137-144.\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2807\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nGoulding, C. (1999), “Consumer research, interpretive paradigms and methodological ambiguities”,\nEuropean Journal of Marketing , Vol. 33 Nos 9/10, pp. 859-873.\nGrant, A.M. and Mayer, D.M. (2009), “Good soldiers and good actors: prosocial and impression\nmanagement motives as interactive predictors of affiliative citizenship behaviors”, Journal of Applied\nPsychology , Vol. 94 No. 4, pp. 900-920.\nGriffin, M., Babin, B.J. and Modianos, D. (2000), “Shopping values of Russian consumers: the impact of\nhabituation in a developing economy”, Journal of Retailing , Vol. 76 No. 1, pp. 33-52.\nGuiot, D. and Roux, D. (2010), “A second-hand shoppers’ motivation scale: antecedents, consequences,\nand implications for retailers”, Journal of Retailing , Vol. 86 No. 4, pp. 355-371.\nGumusluoglu, L. and Ilsev, A. 
(2009), “Transformational leadership, creativity, and organizational\ninnovation”, Journal of Business Research , Vol. 62 No. 4, pp. 461-473.\nHajian, S. and Domingo-Ferrer, J. (2012), “A methodology for direct and indirect discrimination\nprevention in data mining”, IEEE Transactions on Knowledge and Data Engineering ,V o l .2 5N o .7 ,\npp. 1445-1459.\nHarbart, T. (2021), “Tapping the power of unstructured data”, MIT Sloan Management School,\navailable at: https://mitsloan.mit.edu/ideas-made-to-matter/tapping-power-unstructured-data\nHicks, B.J., Culley, S.J. and McMahon, C.A. (2006), “A study of issues relating to information managementacross engineering SMEs”, International Journal of Information Management , Vol. 26 No. 4, pp. 267-289.\nKaur, S., Gupta, S., Singh, S.K. and Perano, M. (2019), “Organizational ambidexterity through global\nstrategic partnerships: a cognitive computing perspective”, Technological Forecasting and Social\nChange , Vol. 145, pp. 43-54.\nKohlbacher, F. (2016), “The use of qualitative content analysis in case study research”, Forum Qualitative\nSozialforschung/Forum: Qualitative Social Research , Vol. 7 No. 1, pp. 1-30.\nManogaran, G., Thota, C. and Lopez, D. (2022), “Human-computer interaction with big data analytics”,\nResearch Anthology on Big Data Analytics, Architectures, and Applications, IGI Global, pp. 1578-1596.\nMarkus, M.L. and Topi, H. (2015), Big Data, Big Decisions for Science, Society, and Business , National\nScience Foundation.\nMasse, M. (2011), REST API Design Rulebook: designing Consistent RESTful Web Service Interfaces ,\nO’Reilly Media.\nMo¨hring, M., Keller, B., Schmidt, R. and Dacko, S. (2020), “Google popular times: towards a better\nunderstanding of tourist customer patronage behavior”, Tourism Review , Vol. 76 No. 3, pp. 553-593.\nMo¨hring, M., Walsh, G., Schmidt, R., Koot, C. and Ha ¨rting, R.C. (2013), “Returns management in\neCommerce”, HMD , Vol. 50 No. 5, pp. 66-75.\nMummalaneni, V. 
(2005), “An empirical investigation of web site characteristics, consumer emotional\nstates and on-line shopping behaviors”, Journal of Business Research , Vol. 58 No. 4, pp. 526-532.\nNutt, P.C. (2008), “Investigating the success of decision making processes”, Journal of Management\nStudies , Vol. 45 No. 2, pp. 425-455.\nPac¸o, A. and Lavrador, T. (2017), “Environmental knowledge and attitudes and behaviours towards\nenergy consumption”, Journal of Environmental Management , Vol. 197, pp. 384-392.\nPapadakis, V.M., Lioukas, S. and Chambers, D. (1998), “Strategic decision-making processes: the role of\nmanagement and context”, Strategic Management Journal , Vol. 19 No. 2, pp. 115-147.\nPauleen, D.J. and Wang, W.Y. (2017), “Does big data mean big knowledge? KM perspectives on big\ndata and analytics”, Journal of Knowledge Management , Vol. 21 No. 1, pp. 1-6.\nPolyakova, A., Loginov, M., Serebrennikova, A. and Thalassinos, E. (2019), “Design of a socio-economic\nprocesses monitoring system based on network analysis and big data”, International Journal of\nEconomics and Business Administration , Vol. 7 No. 1, pp. 30-139.\nRahrovani, Y. and Pinsonneault, A. (2020), “Innovative IT use and innovating with IT: a study of the\nmotivational antecedents of two different types of innovative behaviors”, Journal of the Association for\nInformation Systems , Vol. 21 No. 4, pp. 5-14.\nRaisinghani, M.S. (2000), “Knowledge management: a cognitive perspective on business and\neducation”, American Business Review , Vol. 18 No. 2, pp. 105-131.\nPAGE 2808jJOURNAL OF KNOWLEDGE MANAGEMENT jVOL. 27 NO. 10 2023\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025\nReinsel, D., Gantz, J. and Rydning, J. (2018), The Digitization of the World. From Edge to Core ,A nI D C\nWhite Paper, Seagate.\nRokka, J. and Uusitalo, L. 
(2008), “Preference for green packaging in consumer product choices –do\nconsumer’s care?”, International Journal of Consumer Studies , Vol. 32 No. 5, pp. 516-525.\nSabherwal, R. and King, W.R. (1995), “An empirical taxonomy of the decision-making processes\nconcerning strategic applications of information systems”, Journal of Management Information Systems ,\nVol. 11 No. 4, pp. 177-214.\nSagiroglu, S. and Sinanc, D. (2013), “Big data: a review”, 2013 International Conference on Collaboration\nTechnologies and Systems (CTS) ,IEEE , pp. 42-47.\nSapsford, R. and Jupp, V. (Eds) (1996), Data Collection and Analysis , Sage.\nSchmarzo, B. (2013), Big Data: Understanding How Data Powers Big Business , John Wiley and Sons.\nSchmidt, R. and Mo ¨hring, M. (2013), “Strategic alignment of cloud-based architectures for big data”,\n2013 17th IEEE International Enterprise Distributed Object Computing Conference ,IEEE , pp. 136-143.\nSharma, R., Mithas, S. and Kankanhalli, A. (2014), “Transforming decision-making processes: a research\nagenda for understanding the impact of business analytics on organisations”, European Journal of\nInformation Systems , Vol. 23 No. 4, pp. 433-441.\nSingh, S.K. and Del Giudice, M. (2019), “Big data analytics, dynamic capabilities and firm performance”,\nManagement Decision , Vol. 57 No. 8, pp. 1729-1733.\nStake, R.E. (2000), “Case studies”, in Denzin, N.K and Lincoln, Y.S (Eds), Handbook of Qualitative\nResearch , Sage, pp. 435-453.\nStonebraker, M. (2010), “SQL databases v. NoSQL databases”, Communications of the ACM ,V o l .5 3\nNo. 4, pp. 10-11.\nStrauss, A. and Corbin, J. (1994), “Grounded theory methodology: an overview”, in Denzin, N.K. and\nLincoln, Y.S. (Eds), Handbook of Qualitative Research , Sage, pp. 273-285.\nTurban, E., McLean, E. and Wetherbe, J. (1998), Information Technology for Management Making\nConnections for Strategic Advantage , John Wiley and Sons, Inc.\nWalsham, G. 
(1995), “Interpretive case studies in IS research: nature and method”, European Journal of\nInformation Systems , Vol. 4 No. 2, pp. 74-81.\nYang, Q., Steinfeld, A. and Zimmerman, J. (2019), “Unremarkable AI: fitting intelligent decision support\ninto critical, clinical decision-making processes”, Proceedings of the 2019 CHI Conference on Human\nFactors in Computing Systems , pp. 1-11.\nYin, R.K. (1994), “Designing single-and multiple-case. Improving educational management: through\nresearch and consultancy”, in Bennett, N., Glatter, R. and Levacic, R. (Eds), Improving Educational\nManagement: Through Research and Consultancy , Sage, pp. 135-155.\nYin, R.K. (2003), Case Study Research, Design and Methods , 3rd ed., Sage, Vol. 5.\nYin, R.K. (2012), “Case study methods”, in Cooper, H., Camic, P.M., Long, D.L., Panter, A.T., Rindskopf,\nD. and Sher, K.J. (Eds), APA Handbook of Research Methods in Psychology , Vol. 2.Research Designs:\nQuantitative, Qualitative, Neuropsychological, and Biological , American Psychological Association,\npp. 141-155.\nZakir, J., Seymour, T. and Berg, K. (2015), “Big data analytics”, Issues in Information Systems ,V o l .1 6\nNo. 2, pp. 81-90.\nCorresponding author\nFrancesco Caputo can be contacted at: francesco.caputo2@unina.it\nFor instructions on how to order reprints of this article, please visit our website:\nwww.emeraldgrouppublishing.com/licensing/reprints.htm\nOr contact us for further details: permissions@emeraldinsight.com\nVOL. 27 NO. 10 2023 jJOURNAL OF KNOWLEDGE MANAGEMENT jPAGE 2809\nDownloaded from http://www.emerald.com/jkm/article-pdf/27/10/2797/1834268/jkm-10-2022-0794.pdf by UECE user on 08 October 2025",
"metadata": {
"filename": "Advancing beyond technicism-2022.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\Advancing beyond technicism-2022.pdf",
- "file_size": 191470,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:33.619110",
- "content_length": 55999
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\Advancing beyond technicism-2022.pdf",
+ "size": 191470,
+ "source": "docs_to_import"
+ },
+ "id": "b4a5ecc8-6a2e-4362-8b31-3d798162b3c6"
},
- "38fd80e4-1db5-4f15-a691-9d1e300929d7": {
- "id": "38fd80e4-1db5-4f15-a691-9d1e300929d7",
- "content": "[Página 1]\nIssues in Big Data Testing and Benchmarking\nAlexander Alexandrov \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nalexander.alexandrov@tu-\nberlin.de Christoph Brücke \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nchristoph.bruecke@campus.tu-\nberlin.de Volker Markl \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nvolker.markl@tu- berlin.de \n \n \nABSTRACT \nThe academic community and industry are currently researching \nand building next generation data management systems. These \nsyste ms are designed to analyze data sets of high volume with \nhigh data ingest rates and short response time s executing complex \ndata analysis algorithms on data that does not adher e to relational \ndata model s. As these big data systems differ from standard \nrelational database systems with respect to data and workloads, \nthe traditional benchmarks used by the database community are insufficient. In this paper , we describe initial solutions and \nchallenges wit h respect to big data generation, methods for \ncreating realistic, privacy -aware, and arbitrarily scalable data sets, \nworkloads, and benchmarks from real world data. We will in \nparticular discuss why we feel that workloads currentl y discussed \nin the testing and benchmarking community do not capture the real complexity of big data and highlight several research \nchallenges with respect to massively -parallel data generation and \ndata characterization. \nCategories and Subject Descriptors \nD.2.5 [Testing and Debugging ]: testing tools, data generators \nGeneral Terms \nMeasurement, Performance, Experimentation \nKeywords \nBig Data, Data Generation, Data Profiling, Workloads, Benchmarking \n1. 
INTRODUCTION \nThe database systems building community is currently at a peak \nof new activity, creating novel systems for managing and \nanalyzing what is commonly called “big data.” Big data is usually \ncharacterized by the requirement to conduct advanced analytics on \nlarge volumes of data of variable format, wh ich is ingested into \nthe system with high -velocity with the need for fast response \ntimes. Novel big data analytics systems differ from traditional \ndata analysis systems for varying reasons, they : (a) c an process \nterabytes or even p etabytes of data due to their scale-o ut abilities, \nemployin g massively parallel processing , (b) support complex \ndata types in addition to relational sets of tuples (i.e., data of complex structure, such as text documents, hierarchies, graphs, or even images, audio, or video files) , (c) allow for defining and \nprocessing complex analytics tasks that go beyond the traditional \noperations of the relational algebra ( e.g., user -defined functions, \ndata mining or machine learning algorithms, graph algorithms) , \n(d) provide fault -tolerance in order to ensure termination even for \nlong-running computations , and (e) compute answers with low -\nlatency, producing r esults in a pipelined fashion. \nSome examples of systems that showcase several of these features \nare Google MapReduce [ DG04 ], its open source implementation \nHadoop [ Had13] , its ecosystem of languages (e.g ., Hive \n[TSJ+09 ], JAQL [ BEG+11 ], Pig [ ORS+08]) and libraries such as \nMahout [ Mah13], and other big data systems such as Asterix \n[ABG+12 ], GraphLab [ LBG+12 ], Spark [Spa13] and our own \nStratosphere system [ ABE+10 , Str13 ]. At the same time, there is a \ntrend to make more traditional relational data analysis sy stems \nmore scalable. 
Examples of these efforts are SAP Hana \n[FML +12], Impala [ Imp13 ], Oracle Exadata [ GSA+11 ], or the \ncolumnar storage extensions to Microsoft’s and IBM’s database products , to name a few. \nWhile all these systems have advanced the capabilit ies of data \nanalysis with respect to the five dimensions above, database testing and benchmarking have not moved forward to provide data \ngenerators, data sets, and workloads. In particular, we see the need \nto generate large, realistic data sets at scale, as well as the need for \nwell-defined workloads that capture the nature of novel, modern \nanalysis tasks. \n2. BIG DATA GENERATION \nData generation tools and practices can be principally assigned to \none of two classes : (a) reusing existing, well-known data \ngeneration tools, or (b ) implementing custom, use -case tailored \ndata generator s. We first review the benefits of each one of these \nclasses and then discuss some implications for the evaluation of \nbig data analytics systems. \nSince the establishment of standardized benchmarks as a “gold \nstandard” for performance evaluation of database systems in the \nearly 90’s , experimental results reported in research papers often \nreuse data sets and queries from well-known benchmarks , like \nTPC-H, TPC-C [TPC13 ], and XMLGen [ XML13 ]. This practice \nis justified by two main factors. First, the synthetic data used by \nstandardized or public benchmarks typically adheres to a short \ntextual specification that is well-known in the database \ncommunity . Reusing data sets from such benchmark s therefore \nmakes the data properties and their impact on the evaluated tasks \nmore comprehensible and increases the trust in the reported \nexperiment result s. Second , well-known benchmarks typically \nprovide open-source tools for data and workload generation , \nwhich can be adapted and used by third parties relatively easy . 
\nThis reduces the overall effort required to prepare and execute \nPermission to make digital or hard copies of all or part of this work for \npersonal or classroom use is granted without fee prov ided that copies are \nnot made or distributed for profit or commercial advantage and that \ncopies bear this notice and the full citation on the first page. To copy \notherwise, or republish, to post on servers or to redistribute to lists, \nrequires prior specif ic permission and/or a fee. \nDBTEST ’13, June 24, 201 3, New York City, NY, USA \nCopyright 2 013 ACM 1 -58113 -000-0/00/0010 …$15 .00.\n\n[Página 2]\n“proof -of-concept” experiments and allows researchers to spend \nmore time working on the actual prototype s rather than the tooling \nto evaluate them . \nAn alternative approach that sometimes is preferred for \nspecialized experimental studies is to define an d implement a \ncustom data generator tailored towards the requirements of the \nconcrete experiment s at hand . If the experiments are recognized as \nrelevant by the database community, the data and tasks described \nin the original research are often reused by other authors in \nfollow -up work. For example, Pavlo et al. followed this approach \nin their comparison of approaches for large -scale data analytics \n[PPR+ 09] and implemented a synthetic generator that generates a \ncollection of linked HTML documents and associated data (e.g., \nuser traffic , PageRank ). The data generator and the tasks have \nsince then been used in several other papers dealing with large -\nscale data analytics systems [ DQJ+10 , JOS+10 ]. For graph data, \nthe Kronecker multiplication approach suggested by Leskovec et \nal. [LC K+05] offers a simple algorithm for synthetic generation of \nunlabeled graphs with real world characteristics ( e.g., shrinking \ndiameter, skewed degree distribution) . 
Due to the lack of \npublic ally available real-world graph s in the terabyte range , \nKronecker graphs are often featured in the evaluation sections of \nseveral graph -mining papers over the past few years [ KTF 09, \nKTA +11]. \nPrincipally , the main issue with both classes is the inherent \nsimplicity in the statistical structure of the generated data. In the \nfirst case , this simplicity is driven by the need for concise and \nunderstandable specification for standardized benchmarks. In the \nsecond case, the main hindering factor is the complexity \nintroduced in the data generation programs by the need for \ncorrelated data and the amount of resources that researchers are \nwilling to invest in their development . \n \nFigure 1: Simplified Retail Database Schema \nIn reference to the characteristic s of new big data analysis systems \npresented in Section 1 , the use of oversimplified synthetic data \ncreates a subtle pitfall that may impact the relevance of research \nresults for real -world applications . The reason for this is that per \ndefinition such systems must work in a distributed execution \nenvironment (cluster or cloud), and also must use some form of \ndata-parallelism in order to ensure scale-out . These design \ndecisions are highly sensitive to data skew, which often is present \nin many target application domains “a priori” and potentially \nchanges over time. To illustrate the problem, consider the retail \ndatabase schema depicted on Figure 1 and a use -case, where the \nbenchmarks or experimental setup models an application that \nwants to compute the top -k most purchased items per product \ncategory. Since some product categories are n aturally more in demand than others, introducing a skew over the product category \ndistribution in the joined LINEITEM -PRODUCT view is critical \nto the relevance of the generated data. 
As most systems will \nprocess each product category g roup in parallel, skew will \nobviously influence system performance for this particular task. \nMoreover, for an online computation of the same counts in a streaming setting, the degree of skew will depend on the time of \nthe current window ( e.g., in the U.S. shopping peaks between \nThanksgiving Day & Christmas and attains a maximum on “Black \nFriday ”). In this case, assuming an evenly distributed load across \ntime is an oversimplif ication that can influence the relevance of \nthe experimental results for real -world applications. \nWith the advent of big data co\n mes the requirement to quickly \ngenerate huge data sets. This is particularly a challenge when \ngenerating data sets with key/foreign -key relationships or other \ncomplex correlations across tables. Using specialized random number generators with seed skipping allows for doing so in parallel without having to communicate data generated on one node of a shared -nothing cluster to another [RFS+10 , FPR12, \nASP+1 1, ATM12], resulting in toolkits such as PDGF [PDG13] \nor Myriad [Myr13] . Both toolkits provide a set of domain-specific \nprimitives for data generation that facilitate the transparent use of \nseed-skip PRNGs and complementary technique s for scalable \ngeneration of complex data . \n3. GENERATING REALISTIC DATA SETS \nThe advances in new methods for scalable generation of realistic \ndata highlight an important practical question: “If the data \ngenerator program can be expressed in terms of a small set of \nspecial primitives, then to which extent and in which scenarios \ncan the specification process itself be executed automatically ?” A \nnaïve general approach is based on the analysis of empirical \nobservations in the modeled domain and the subsequent synthesis \nof a data generator specification from these observations . 
In \nbusiness scenarios , however, the analysis is often done in the \ncontext of a reference dataset that represents a ground truth for the \nderived data generator. This section sketches our vision for an \nintegrated framework for such usage scenarios . We propose an \nextensible architecture with clean separation between the data \ngeneration primitives and the methods and techniques used to \nextract relevant features from the ground truth data set. \nA large problem for benchmarking and testing of big data system s \nis the lack of realistic data sets. Many synthetic data sets follow \nsimplistic assumptions ( e.g., few correlations, most ly uniform \ndistributions, over simplified schema ) that are not re presentative \nfor real-world data . A promising, generalizable, and more \neffective way is to automatically extract the domain information \nfrom a ground truth data set , which is often available in practice. \nFigure 2 illustrates our envisioned pipeline. The domain \ninformation is first extracted from the reference database in the form of domain constraints , which can be either hard (e.g., foreign \nkeys, unique keys, and other functional dependencies ) or soft \n(e.g., local statistical models) . The obtained structural, semantic, \nand statistical information is then unified into a n intermediate \nmodel representing the schema information with ann otated \nconstraints . A final synthesis pass transforms the intermediate \nrepresentation into a data generator specification for a specific \ntarget environment like the Myriad . This specification is then used \nto create a concrete data generator instance that is able to mimic \nthe original data set.\n\n[Página 3]\nFigure 2: A Pipeline for the A nalysis & Synthesis of D ata Generators \nWe note that in the first step of this process, the circumstances in \nwhich the analysis is performed will influence its depth and \nconsequently the quality of the collected domain information. 
\nIf the reference database cannot be accessed directly and the \ndomain information is available only in a derived form , such as in \na database catalog, the analysis must be performed indirectly and \ncan only extract the available catalog information . This \ninformation commonly consists of attribute value statistics ( e.g., \nfrequen cy values, histograms, number of distinct va lues, and \nnumber of NULLs) , schema information , and integrity constraints \n(e.g., referential integrity , primary key s, and unique constraints as \nwell as other constraints represent ing domain invariants ). \nAlternativ ely, if the reference database is available directly , \nadvanced profiling methods could be leveraged to obtain information beyond the catalog in order to capture a more accurate domain model . This approach will require us to \ndetermine additional characterizations of the dataset to be generated (e.g., advanced multivariate statistics [SHM+06] and \nsoft constraints [IMH+04, BH03, SBH+06]) on the data with \nscalable methods (see [HIL+09] for an overview of statistical methods, and [Nau13] for an overview of data profiling). Using \nthese techniques will allow for determining the essential characteristics of real -world data set s and correspondingly will \nenable one to scale up or down synthetic clones . \nThe integration of data profiling and data generation workflows is \nrelevant in the era of big data for a number of reasons . First, many \ninstitutions publish their data sets in order to let others perform \ntheir experiments on them. However, database sizes are becoming larger and larger . Conseque ntly, it is becom ing increasingly \ndifficult to transfer these huge data sets to the person wishing to \nuse them due to network and bandwidth constraints . Therefore, it \nis desirable to have a compact specification of the data sets, i.e. 
, a \nsynopsis or profile from which one can automatically generate a \ndata generator specification and thus the dataset . Second, data \nprofiling will increase the relevance tests or benchmarks. Huppler \n[Hup09] describes five key aspects for a good benchmark , namely \na good b enchmark has to be relevant, repeatable, fair, verifiable, \nand economical. Section 2 mainly addressed the latter one, while \ndata profiling will help to improv e the relevance. \nCurrently, w e are develo ping a prototype called Oligos [Oli13] \nthat adheres to our aforementioned vision. The initial version of Oligos can generate data generator specifications for the Myriad Toolkit [Myr13] from the system catalog of a database system. Our long -term vision is provide a modular API that will allow \nlearning advanced statistics and correlation information, in order \nto ge nerate even more realistic data sets. \n4. AN APPLICATION: REGRESSION \nTESTING OF BIG DATA SYSTEMS \nAn important part of the maintenance lifecycle of co mmercial big \ndata system s as well as general data management system s is \ndevoted to the diagnosis of performance regressions observed by customers in a production setting. When trying to reproduce the \nproblematic behavior in a test environment, database system \ndevelopers often face the problem of missing data – even though \nthe database schema and the problematic queries can be provided by the customer as part of the regression report, the actual \ndatabase instance typically cannot be obtained ( e.g., due to \nprivacy restrictions). T ypically , what is available is the database \ncatalog, which contains a statistical approximation of the \nreference database in the form of value distributions, cardinalities , \nand histograms on columns or column groups. As a fallback solution, developers currently trick the optimizer of a test database \nby feeding customer catalog data in order to obtain the query \naccess paths of the actual production system. 
As the underlying \ndata is missing and the database catalog is usually lacking crucial \ninformation (e.g., on multivariate distributions ) synthetic data sets \ngenerated in the lab are not representative. Thus, information on \nhow the query access paths perform requires further assistance \nand feedback from the client. The lack of a complete and \nrepresentative regression database therefore slows down the maintenance process and causes additional costs . The methods for \ndata generation based on data and workload characterization as \nenvisioned in Oligos and Myriad would offer a remedy to this \nproblem. \n5. OPEN ISSUES AND CONCLUSIONS \nWe have given an overview of issues in big data benchmarking \nand testing, with a strong focus on data generation. We believe that efficiently generating a huge, realistic data set is an important \nprerequisite for the advancement, evaluation, and fair comparison \nof big data systems. Myriad [Myr13], PDGF [DPG13] , and Oligos \n[Oli13 ] are a first step in this direction. However, in the context of \nbig data generation and benchmarking, a large number of \nchallenges remain open. \nHowever, in the context of big data generation and benchmarking, \na l\narge number of challenges remain open. For realistic data \ngeneration from a given reference dataset the challenges exist \nboth in the analysis and the synthesis phase. \nDuring the analysis phase, a combination of data characterization \nand profiling methods can be identified and applied in order to increase the quality of the dom ain information that can be inferred \ndirectly from the reference database. Such methods will allow to \nefficiently determine multi-key dependencies, in particular \nreferential integrity, as well as to profile data with complex structure (e.g., text, graphs, NF² and hierarchical data). In order to preserve privacy when conducting data profiling, data obfuscation \nmethods may as well be required. 
[Nau13] lists further challenges \nin the area of data profiling. \nInferred schema information and constraints must be then unified \ninto an intermediate representation (IR) in the synthesis phase. \nTwo problems exist in this context. First, in order to facilitate the subsequent translation of the IR into a data generator specification, the IR should lend itself to the features and \nprimitives common to the underlying data -generation engines. \nSecond, the unification process should determine and handle\n\n[Página 4]\ninconsistencies in the domain information collected in the analysis \nphase. Recently, Arasu [AKL11] and Torlak [Tor12] suggested \ntwo different constraint -based languages for data generator \nspecification that can serve as a starting point for the development of a suitable IR and synthesis algorithm. For both languages, the authors give sufficiency conditions for the existence of a data set \nfulfilling the input constraints and provide approximate \nalgorithms to find such an instance. The approach presented in \n[Tor12] uses a mix of hard (dimension or integrity) and soft \n(statistical) c onstraints and is restricted to dimension models, \nwhereas [AKL11] works on general relational models and relies \nsolely on soft (cardinality) constraints (hard constraints are \nrepresented implicitly as a special form of soft constraints). As the target lang uage in our setting is likely to include primitives that \ndirectly enforce certain types of hard constraints (e.g. unique keys, \nforeign keys), we believe that a distinction between soft and hard \nconstraints in the IR is a more promising approach. \nAnother big open area is the provisioning of workloads. \nTraditional benchmarks focus on simple workloads that \nessentially follow the relational algebra or an NF² algebra/ \nXQuery. 
For evaluating and testing big data analytics systems, we \nwill require m ore complex workloads that involve machine \nlearning algorithms, information extraction, and graph analysis/mining. The lack of a standardized data analysis language currently is a big obstacle for arriving at realistic, comparable, and universally useful w orkload specifications. Ideally, u ntil a \nstandardized declarative language is available use-case \nrepositories may be a first step in this direction. \n6. ACKNOWLEDGMENTS \nWe thank Berni Schiefer from IBM and Tillmann Rabl from the \nUniversity of Toronto for interesting discussions. Our \ninvestigations were funded by a CAS grant from IBM, the ICT \nLabs of the European Institute of Technology as well as the DFG \n(German National Science Foundation) via the Stratosphere Collabor ative Research Unit. \n7. REFERENCES \n[ABE+10] A. Alexandrov, D. Battré, S. Ewen, M. Heimel, F. \nHueske, O. Kao, V. Markl, E. Nijkamp, D. Warneke: Massively Parallel Data Analysis with PACTs on Nephele. PVLDB Vol. 3, No. 2, pp. 1625– 1628 \n(2010) \n[ABG+12] S. Als ubaiee, A. Behm, R. Grover, R. Vernica, V. \nBorkar, M. J. Carey, C. Li: ASTERIX: Scalable \nWarehouse -Style Web Data Integration. In \nProceedings of the Ninth International Workshop on \nInformation Integration on the Web, Article 2, ACM, (2012) \n[AKL11] A. Arasu, R. Kaushik, J. Li: Data Generation using \nDeclarative Constraints. Proceeding of the SIGMOD Conference, pp. 685-696 (2011) \n [ASP+1 1] A. Alexandrov, B. Schiefer, J. Poelman, S. Ewen, T. \nBodner, V. Markl: Myriad - Parallel Data Generation \non Shared-Not hing Architectures , In Proc. ASBD, pp. \n30-33 (2011) \n[ATM12] A. Ale xandrov, K. Tzoumas, V. Markl: Myriad: \nScalable and Expressive Data Generation , In Proc. \nVLDB(5) pp. 1890- 1893 ( 2012) \n[BH03] P. Brown, P. Haas: BHUNT: Automatic Discovery of \nFuzzy Algebraic Constraints in Relational Data. \nVLDB 2003: 668-679 [BEG+11] K. S. Beyer, V. Ercegovac, R. Gemulla, A. 
Balmin, M. Eltabakh, C.-C. Kanne, E. J. Shekita: Jaql: A scripting language for large scale semistructured data \nanalysis. In Proc. of VLDB Conference. (20 11) \n[DG04] J. Dean, S. Ghemawat: MapReduce: simplified data processing on large clusters , In OSDI, pp. 137 -150 \n(2004) \n[FML+ 12] F. Färber, N . May, W . Lehner, P . Große, I . Müller, H . \nRauhe, J . Dees: The SAP HANA Database -- An \nArchitecture Overview. IEEE Data Eng. Bull. 35(1): \n28-33 (2012) \n[FPR12] M. Frank, M. Poess, T. Rabl: Efficient update data generation for DBMS benchmarks. ICPE 2012: 169-180 \n[GSA+11 ] R. Greenwald, R. Stackowiak, M. Alam, M. Bhuller.. \nAchieving extreme performance with Oracle Exadata. McGraw -Hill Osborne Media (2011) \n[Had13] \nhttp://hadoop.apache.org/ , last accessed 05 -10-2013 \n[HIL+09] P. J. Haas, I. Ilyas, G. Lohman, V. Markl: Disco vering \nand Exploiting Statistical Properties for Query Optimization in Relational Databases: A Survey. Statistical Analysis and Data Mining 1(4): 223 -250 \n(2009) \n[Hup93 ] K. Huppler: The Art of Building a Good Benchmark. \nTPCTC 2009: 18- 30 (2009) \n[IMH+04] I. Ilyas, V . Markl, P . Haas, P . Brown, A . Aboulnaga: \nCORDS: Automatic Discovery of Cor relations and \nSoft Functional Dependencies. SIGMOD Conference 2004: 647-658 \n[Imp13] \nhttps://github.com/cloudera/impala , last accessed 05 -\n10-2013 \n[LBG+12] Y. Low, D. Bickson , J. Gonzalez, C. Guestrin , A. \nKyrola , J. M. Hellerstein: DistributedGraphLab: A \nframework for machine learning and data mining in \nthe cloud. Proceedings of the VLDB Endowment, \n5(8), pp. 
716-727 (2012) \n[Mah13] Mahout: http://mahout.apache.org/ , last accessed 04 -\n21-2013 \n[Myr13] https://github.com/TU -Berlin-DIMA/myriad -\ntoolkit/wiki , last accessed 05 -10-2013 \n[Nau13] http://www.hpi.uni-\npotsdam.de/naumann/publications/publications_by_ty\npe/year/2013/2276/Nau13.html , SIGMOD Record \n(2013) \n[Oli13] https://github.com/TU -Berlin-DIMA/myriad -\ntoolkit/wiki/Using -Oligos -Guide , last accessed 05 -10-\n2013 \n[ORS+08] C. Olston, B. Reed, U. Srivastava, R. Kumar, A. \nTomkins: Pig Latin: A Not-So -Foreign Language for \nData Processing. Proceedings of the SIGMOD \nConference (SIGMOD), pp. 1099 -1110, (2008) \n[PDG13] http://www.paralleldatageneration.org/drupal6/ , last \naccessed 05 -10-2013 \n[RFS+10] T. Rabl, M . Frank, H . Sergieh, H . Kosch: A Data \nGenerator for Cloud-Scale Benchmarking. TPCTC 2010: 41- 56\n\n[Página 5]\n[SBH+06] Y. Sismanis, P. Brown, P. Haas, B. Reinwald: \nGORDIAN: Efficient and Scalable Discovery of Composite Keys. VLDB 2006: 691-702 \n[SHM+06] U. Srivastava, P. Haas, V . Markl, M . Kutsch, T .Tran: \nISOMER: Consistent Histogram Construction Using \nQuery Feedback. ICDE (2006) \n[Spa 13] \nhttp://spark -project.org/ , last accessed 05 -10-2013 \n[Str13] http://www.stratosphere.eu/ , last accessed 05 -10-2013 \n[Tor12] E. Torlak: Scalable test data generation from multidimensional models . Proceedings of the ACM \nSIGSOFT 20th International Symposium on the \nFoundations of S oftware Engineering (2012) \n[TPC13] \nhttp://www.tpc.org , last accessed 05 -10-2013 \n[TSJ+09] A. Thusoo, J. S.Sarma, N. Jain, Z. Shao, P.Chakka, S. Anthony, H. Liu, P. Wyckoff, R. Murthy: Hive - A \nWarehousing Solution Over a M ap-Reduce \nFramework. PVLDB 2(2), pp. 1626- 1629 (2009) \n[XML13] \nhttp://www.xml-benchmark.org/ , last accessed 05 -10-\n2013 [PPR+09] A. Pavlo, E . Paulson, A . Rasin, D. Abadi, D . DeWitt, \nS. Madden, M . Stonebraker: A comparison of \napproaches to large-scale data analysis. 
SIGMOD \nConference 2009: 165 -178 \n[DQJ+10] J. Dittrich, J . Quiané -Ruiz, A . Jindal, Y . Kargin, V . \nSetty, J . Schad: Hadoop++: Making a Yellow \nElephant Run Like a Cheetah (Without It Even \nNoticing). PVLDB 3(1): 518- 529 (2010) \n[JOS+10] D. Jiang, B . C. Ooi, L. Shi, S . Wu: The Performance \nof MapReduce: An In-depth Study. PVLDB 3(1):472-483 (2010) \n[LCK+05] J. Leskovec, D . Chakrabarti, J . Kleinberg, C . \nFaloutsos: Realistic, Mathematically Tractable Graph Gene ration and Evolution, Using Kronecker \nMultiplication. PKDD 2005: 133 -145 \n[KTF09] U. Kang, C .E. Tsourakakis, C . Faloutsos: PEGASUS: \nA Peta -Scale Graph Mining System. ICDM 2009: \n229-238 \n[KTA+11] U. Kang, C.E. Tsourakakis, A.P. Appel, C . Faloutsos, \nJ. Leskovec: HADI: Mining Radii of Large Graphs. \nTKDD 5(2): 8 (2011)",
+ "1dbda3bb-553e-4f98-9333-3464241cfcd5": {
+ "content": "Issues in Big Data Testing and Benchmarking\nAlexander Alexandrov \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nalexander.alexandrov@tu-\nberlin.de Christoph Brücke \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nchristoph.bruecke@campus.tu-\nberlin.de Volker Markl \nTechnische Universität Berlin \nEinsteinufer 17 \n10587 Berlin, Germany \n+49 30 314 23555 \nvolker.markl@tu- berlin.de \n \n \nABSTRACT \nThe academic community and industry are currently researching \nand building next generation data management systems. These \nsyste ms are designed to analyze data sets of high volume with \nhigh data ingest rates and short response time s executing complex \ndata analysis algorithms on data that does not adher e to relational \ndata model s. As these big data systems differ from standard \nrelational database systems with respect to data and workloads, \nthe traditional benchmarks used by the database community are insufficient. In this paper , we describe initial solutions and \nchallenges wit h respect to big data generation, methods for \ncreating realistic, privacy -aware, and arbitrarily scalable data sets, \nworkloads, and benchmarks from real world data. We will in \nparticular discuss why we feel that workloads currentl y discussed \nin the testing and benchmarking community do not capture the real complexity of big data and highlight several research \nchallenges with respect to massively -parallel data generation and \ndata characterization. \nCategories and Subject Descriptors \nD.2.5 [Testing and Debugging ]: testing tools, data generators \nGeneral Terms \nMeasurement, Performance, Experimentation \nKeywords \nBig Data, Data Generation, Data Profiling, Workloads, Benchmarking \n1. 
INTRODUCTION \nThe database systems building community is currently at a peak \nof new activity, creating novel systems for managing and \nanalyzing what is commonly called “big data.” Big data is usually \ncharacterized by the requirement to conduct advanced analytics on \nlarge volumes of data of variable format, wh ich is ingested into \nthe system with high -velocity with the need for fast response \ntimes. Novel big data analytics systems differ from traditional \ndata analysis systems for varying reasons, they : (a) c an process \nterabytes or even p etabytes of data due to their scale-o ut abilities, \nemployin g massively parallel processing , (b) support complex \ndata types in addition to relational sets of tuples (i.e., data of complex structure, such as text documents, hierarchies, graphs, or even images, audio, or video files) , (c) allow for defining and \nprocessing complex analytics tasks that go beyond the traditional \noperations of the relational algebra ( e.g., user -defined functions, \ndata mining or machine learning algorithms, graph algorithms) , \n(d) provide fault -tolerance in order to ensure termination even for \nlong-running computations , and (e) compute answers with low -\nlatency, producing r esults in a pipelined fashion. \nSome examples of systems that showcase several of these features \nare Google MapReduce [ DG04 ], its open source implementation \nHadoop [ Had13] , its ecosystem of languages (e.g ., Hive \n[TSJ+09 ], JAQL [ BEG+11 ], Pig [ ORS+08]) and libraries such as \nMahout [ Mah13], and other big data systems such as Asterix \n[ABG+12 ], GraphLab [ LBG+12 ], Spark [Spa13] and our own \nStratosphere system [ ABE+10 , Str13 ]. At the same time, there is a \ntrend to make more traditional relational data analysis sy stems \nmore scalable. 
Examples of these efforts are SAP Hana \n[FML +12], Impala [ Imp13 ], Oracle Exadata [ GSA+11 ], or the \ncolumnar storage extensions to Microsoft’s and IBM’s database products , to name a few. \nWhile all these systems have advanced the capabilit ies of data \nanalysis with respect to the five dimensions above, database testing and benchmarking have not moved forward to provide data \ngenerators, data sets, and workloads. In particular, we see the need \nto generate large, realistic data sets at scale, as well as the need for \nwell-defined workloads that capture the nature of novel, modern \nanalysis tasks. \n2. BIG DATA GENERATION \nData generation tools and practices can be principally assigned to \none of two classes : (a) reusing existing, well-known data \ngeneration tools, or (b ) implementing custom, use -case tailored \ndata generator s. We first review the benefits of each one of these \nclasses and then discuss some implications for the evaluation of \nbig data analytics systems. \nSince the establishment of standardized benchmarks as a “gold \nstandard” for performance evaluation of database systems in the \nearly 90’s , experimental results reported in research papers often \nreuse data sets and queries from well-known benchmarks , like \nTPC-H, TPC-C [TPC13 ], and XMLGen [ XML13 ]. This practice \nis justified by two main factors. First, the synthetic data used by \nstandardized or public benchmarks typically adheres to a short \ntextual specification that is well-known in the database \ncommunity . Reusing data sets from such benchmark s therefore \nmakes the data properties and their impact on the evaluated tasks \nmore comprehensible and increases the trust in the reported \nexperiment result s. Second , well-known benchmarks typically \nprovide open-source tools for data and workload generation , \nwhich can be adapted and used by third parties relatively easy . 
\nThis reduces the overall effort required to prepare and execute \nPermission to make digital or hard copies of all or part of this work for \npersonal or classroom use is granted without fee prov ided that copies are \nnot made or distributed for profit or commercial advantage and that \ncopies bear this notice and the full citation on the first page. To copy \notherwise, or republish, to post on servers or to redistribute to lists, \nrequires prior specif ic permission and/or a fee. \nDBTEST ’13, June 24, 201 3, New York City, NY, USA \nCopyright 2 013 ACM 1 -58113 -000-0/00/0010 …$15 .00. \n \n“proof -of-concept” experiments and allows researchers to spend \nmore time working on the actual prototype s rather than the tooling \nto evaluate them . \nAn alternative approach that sometimes is preferred for \nspecialized experimental studies is to define an d implement a \ncustom data generator tailored towards the requirements of the \nconcrete experiment s at hand . If the experiments are recognized as \nrelevant by the database community, the data and tasks described \nin the original research are often reused by other authors in \nfollow -up work. For example, Pavlo et al. followed this approach \nin their comparison of approaches for large -scale data analytics \n[PPR+ 09] and implemented a synthetic generator that generates a \ncollection of linked HTML documents and associated data (e.g., \nuser traffic , PageRank ). The data generator and the tasks have \nsince then been used in several other papers dealing with large -\nscale data analytics systems [ DQJ+10 , JOS+10 ]. For graph data, \nthe Kronecker multiplication approach suggested by Leskovec et \nal. [LC K+05] offers a simple algorithm for synthetic generation of \nunlabeled graphs with real world characteristics ( e.g., shrinking \ndiameter, skewed degree distribution) . 
Due to the lack of \npublic ally available real-world graph s in the terabyte range , \nKronecker graphs are often featured in the evaluation sections of \nseveral graph -mining papers over the past few years [ KTF 09, \nKTA +11]. \nPrincipally , the main issue with both classes is the inherent \nsimplicity in the statistical structure of the generated data. In the \nfirst case , this simplicity is driven by the need for concise and \nunderstandable specification for standardized benchmarks. In the \nsecond case, the main hindering factor is the complexity \nintroduced in the data generation programs by the need for \ncorrelated data and the amount of resources that researchers are \nwilling to invest in their development . \n \nFigure 1: Simplified Retail Database Schema \nIn reference to the characteristic s of new big data analysis systems \npresented in Section 1 , the use of oversimplified synthetic data \ncreates a subtle pitfall that may impact the relevance of research \nresults for real -world applications . The reason for this is that per \ndefinition such systems must work in a distributed execution \nenvironment (cluster or cloud), and also must use some form of \ndata-parallelism in order to ensure scale-out . These design \ndecisions are highly sensitive to data skew, which often is present \nin many target application domains “a priori” and potentially \nchanges over time. To illustrate the problem, consider the retail \ndatabase schema depicted on Figure 1 and a use -case, where the \nbenchmarks or experimental setup models an application that \nwants to compute the top -k most purchased items per product \ncategory. Since some product categories are n aturally more in demand than others, introducing a skew over the product category \ndistribution in the joined LINEITEM -PRODUCT view is critical \nto the relevance of the generated data. 
As most systems will \nprocess each product category g roup in parallel, skew will \nobviously influence system performance for this particular task. \nMoreover, for an online computation of the same counts in a streaming setting, the degree of skew will depend on the time of \nthe current window ( e.g., in the U.S. shopping peaks between \nThanksgiving Day & Christmas and attains a maximum on “Black \nFriday ”). In this case, assuming an evenly distributed load across \ntime is an oversimplif ication that can influence the relevance of \nthe experimental results for real -world applications. \nWith the advent of big data co\n mes the requirement to quickly \ngenerate huge data sets. This is particularly a challenge when \ngenerating data sets with key/foreign -key relationships or other \ncomplex correlations across tables. Using specialized random number generators with seed skipping allows for doing so in parallel without having to communicate data generated on one node of a shared -nothing cluster to another [RFS+10 , FPR12, \nASP+1 1, ATM12], resulting in toolkits such as PDGF [PDG13] \nor Myriad [Myr13] . Both toolkits provide a set of domain-specific \nprimitives for data generation that facilitate the transparent use of \nseed-skip PRNGs and complementary technique s for scalable \ngeneration of complex data . \n3. GENERATING REALISTIC DATA SETS \nThe advances in new methods for scalable generation of realistic \ndata highlight an important practical question: “If the data \ngenerator program can be expressed in terms of a small set of \nspecial primitives, then to which extent and in which scenarios \ncan the specification process itself be executed automatically ?” A \nnaïve general approach is based on the analysis of empirical \nobservations in the modeled domain and the subsequent synthesis \nof a data generator specification from these observations . 
In \nbusiness scenarios , however, the analysis is often done in the \ncontext of a reference dataset that represents a ground truth for the \nderived data generator. This section sketches our vision for an \nintegrated framework for such usage scenarios . We propose an \nextensible architecture with clean separation between the data \ngeneration primitives and the methods and techniques used to \nextract relevant features from the ground truth data set. \nA large problem for benchmarking and testing of big data system s \nis the lack of realistic data sets. Many synthetic data sets follow \nsimplistic assumptions ( e.g., few correlations, most ly uniform \ndistributions, over simplified schema ) that are not re presentative \nfor real-world data . A promising, generalizable, and more \neffective way is to automatically extract the domain information \nfrom a ground truth data set , which is often available in practice. \nFigure 2 illustrates our envisioned pipeline. The domain \ninformation is first extracted from the reference database in the form of domain constraints , which can be either hard (e.g., foreign \nkeys, unique keys, and other functional dependencies ) or soft \n(e.g., local statistical models) . The obtained structural, semantic, \nand statistical information is then unified into a n intermediate \nmodel representing the schema information with ann otated \nconstraints . A final synthesis pass transforms the intermediate \nrepresentation into a data generator specification for a specific \ntarget environment like the Myriad . This specification is then used \nto create a concrete data generator instance that is able to mimic \nthe original data set. \n\n \n \nFigure 2: A Pipeline for the A nalysis & Synthesis of D ata Generators \nWe note that in the first step of this process, the circumstances in \nwhich the analysis is performed will influence its depth and \nconsequently the quality of the collected domain information. 
\nIf the reference database cannot be accessed directly and the \ndomain information is available only in a derived form , such as in \na database catalog, the analysis must be performed indirectly and \ncan only extract the available catalog information . This \ninformation commonly consists of attribute value statistics ( e.g., \nfrequen cy values, histograms, number of distinct va lues, and \nnumber of NULLs) , schema information , and integrity constraints \n(e.g., referential integrity , primary key s, and unique constraints as \nwell as other constraints represent ing domain invariants ). \nAlternativ ely, if the reference database is available directly , \nadvanced profiling methods could be leveraged to obtain information beyond the catalog in order to capture a more accurate domain model . This approach will require us to \ndetermine additional characterizations of the dataset to be generated (e.g., advanced multivariate statistics [SHM+06] and \nsoft constraints [IMH+04, BH03, SBH+06]) on the data with \nscalable methods (see [HIL+09] for an overview of statistical methods, and [Nau13] for an overview of data profiling). Using \nthese techniques will allow for determining the essential characteristics of real -world data set s and correspondingly will \nenable one to scale up or down synthetic clones . \nThe integration of data profiling and data generation workflows is \nrelevant in the era of big data for a number of reasons . First, many \ninstitutions publish their data sets in order to let others perform \ntheir experiments on them. However, database sizes are becoming larger and larger . Conseque ntly, it is becom ing increasingly \ndifficult to transfer these huge data sets to the person wishing to \nuse them due to network and bandwidth constraints . Therefore, it \nis desirable to have a compact specification of the data sets, i.e. 
, a \nsynopsis or profile from which one can automatically generate a \ndata generator specification and thus the dataset . Second, data \nprofiling will increase the relevance tests or benchmarks. Huppler \n[Hup09] describes five key aspects for a good benchmark , namely \na good b enchmark has to be relevant, repeatable, fair, verifiable, \nand economical. Section 2 mainly addressed the latter one, while \ndata profiling will help to improv e the relevance. \nCurrently, w e are develo ping a prototype called Oligos [Oli13] \nthat adheres to our aforementioned vision. The initial version of Oligos can generate data generator specifications for the Myriad Toolkit [Myr13] from the system catalog of a database system. Our long -term vision is provide a modular API that will allow \nlearning advanced statistics and correlation information, in order \nto ge nerate even more realistic data sets. \n4. AN APPLICATION: REGRESSION \nTESTING OF BIG DATA SYSTEMS \nAn important part of the maintenance lifecycle of co mmercial big \ndata system s as well as general data management system s is \ndevoted to the diagnosis of performance regressions observed by customers in a production setting. When trying to reproduce the \nproblematic behavior in a test environment, database system \ndevelopers often face the problem of missing data – even though \nthe database schema and the problematic queries can be provided by the customer as part of the regression report, the actual \ndatabase instance typically cannot be obtained ( e.g., due to \nprivacy restrictions). T ypically , what is available is the database \ncatalog, which contains a statistical approximation of the \nreference database in the form of value distributions, cardinalities , \nand histograms on columns or column groups. As a fallback solution, developers currently trick the optimizer of a test database \nby feeding customer catalog data in order to obtain the query \naccess paths of the actual production system. 
As the underlying \ndata is missing and the database catalog is usually lacking crucial \ninformation (e.g., on multivariate distributions ) synthetic data sets \ngenerated in the lab are not representative. Thus, information on \nhow the query access paths perform requires further assistance \nand feedback from the client. The lack of a complete and \nrepresentative regression database therefore slows down the maintenance process and causes additional costs . The methods for \ndata generation based on data and workload characterization as \nenvisioned in Oligos and Myriad would offer a remedy to this \nproblem. \n5. OPEN ISSUES AND CONCLUSIONS \nWe have given an overview of issues in big data benchmarking \nand testing, with a strong focus on data generation. We believe that efficiently generating a huge, realistic data set is an important \nprerequisite for the advancement, evaluation, and fair comparison \nof big data systems. Myriad [Myr13], PDGF [DPG13] , and Oligos \n[Oli13 ] are a first step in this direction. However, in the context of \nbig data generation and benchmarking, a large number of \nchallenges remain open. \nHowever, in the context of big data generation and benchmarking, \na l\narge number of challenges remain open. For realistic data \ngeneration from a given reference dataset the challenges exist \nboth in the analysis and the synthesis phase. \nDuring the analysis phase, a combination of data characterization \nand profiling methods can be identified and applied in order to increase the quality of the dom ain information that can be inferred \ndirectly from the reference database. Such methods will allow to \nefficiently determine multi-key dependencies, in particular \nreferential integrity, as well as to profile data with complex structure (e.g., text, graphs, NF² and hierarchical data). In order to preserve privacy when conducting data profiling, data obfuscation \nmethods may as well be required. 
[Nau13] lists further challenges \nin the area of data profiling. \nInferred schema information and constraints must be then unified \ninto an intermediate representation (IR) in the synthesis phase. \nTwo problems exist in this context. First, in order to facilitate the subsequent translation of the IR into a data generator specification, the IR should lend itself to the features and \nprimitives common to the underlying data -generation engines. \nSecond, the unification process should determine and handle \ninconsistencies in the domain information collected in the analysis \nphase. Recently, Arasu [AKL11] and Torlak [Tor12] suggested \ntwo different constraint -based languages for data generator \nspecification that can serve as a starting point for the development of a suitable IR and synthesis algorithm. For both languages, the authors give sufficiency conditions for the existence of a data set \nfulfilling the input constraints and provide approximate \nalgorithms to find such an instance. The approach presented in \n[Tor12] uses a mix of hard (dimension or integrity) and soft \n(statistical) c onstraints and is restricted to dimension models, \nwhereas [AKL11] works on general relational models and relies \nsolely on soft (cardinality) constraints (hard constraints are \nrepresented implicitly as a special form of soft constraints). As the target lang uage in our setting is likely to include primitives that \ndirectly enforce certain types of hard constraints (e.g. unique keys, \nforeign keys), we believe that a distinction between soft and hard \nconstraints in the IR is a more promising approach. \nAnother big open area is the provisioning of workloads. \nTraditional benchmarks focus on simple workloads that \nessentially follow the relational algebra or an NF² algebra/ \nXQuery. 
For evaluating and testing big data analytics systems, we \nwill require m ore complex workloads that involve machine \nlearning algorithms, information extraction, and graph analysis/mining. The lack of a standardized data analysis language currently is a big obstacle for arriving at realistic, comparable, and universally useful w orkload specifications. Ideally, u ntil a \nstandardized declarative language is available use-case \nrepositories may be a first step in this direction. \n6. ACKNOWLEDGMENTS \nWe thank Berni Schiefer from IBM and Tillmann Rabl from the \nUniversity of Toronto for interesting discussions. Our \ninvestigations were funded by a CAS grant from IBM, the ICT \nLabs of the European Institute of Technology as well as the DFG \n(German National Science Foundation) via the Stratosphere Collabor ative Research Unit. \n7. REFERENCES \n[ABE+10] A. Alexandrov, D. Battré, S. Ewen, M. Heimel, F. \nHueske, O. Kao, V. Markl, E. Nijkamp, D. Warneke: Massively Parallel Data Analysis with PACTs on Nephele. PVLDB Vol. 3, No. 2, pp. 1625– 1628 \n(2010) \n[ABG+12] S. Als ubaiee, A. Behm, R. Grover, R. Vernica, V. \nBorkar, M. J. Carey, C. Li: ASTERIX: Scalable \nWarehouse -Style Web Data Integration. In \nProceedings of the Ninth International Workshop on \nInformation Integration on the Web, Article 2, ACM, (2012) \n[AKL11] A. Arasu, R. Kaushik, J. Li: Data Generation using \nDeclarative Constraints. Proceeding of the SIGMOD Conference, pp. 685-696 (2011) \n [ASP+1 1] A. Alexandrov, B. Schiefer, J. Poelman, S. Ewen, T. \nBodner, V. Markl: Myriad - Parallel Data Generation \non Shared-Not hing Architectures , In Proc. ASBD, pp. \n30-33 (2011) \n[ATM12] A. Ale xandrov, K. Tzoumas, V. Markl: Myriad: \nScalable and Expressive Data Generation , In Proc. \nVLDB(5) pp. 1890- 1893 ( 2012) \n[BH03] P. Brown, P. Haas: BHUNT: Automatic Discovery of \nFuzzy Algebraic Constraints in Relational Data. \nVLDB 2003: 668-679 [BEG+11] K. S. Beyer, V. Ercegovac, R. Gemulla, A. 
Balmin, M. Eltabakh, C.-C. Kanne, E. J. Shekita: Jaql: A scripting language for large scale semistructured data \nanalysis. In Proc. of VLDB Conference. (20 11) \n[DG04] J. Dean, S. Ghemawat: MapReduce: simplified data processing on large clusters , In OSDI, pp. 137 -150 \n(2004) \n[FML+ 12] F. Färber, N . May, W . Lehner, P . Große, I . Müller, H . \nRauhe, J . Dees: The SAP HANA Database -- An \nArchitecture Overview. IEEE Data Eng. Bull. 35(1): \n28-33 (2012) \n[FPR12] M. Frank, M. Poess, T. Rabl: Efficient update data generation for DBMS benchmarks. ICPE 2012: 169-180 \n[GSA+11 ] R. Greenwald, R. Stackowiak, M. Alam, M. Bhuller.. \nAchieving extreme performance with Oracle Exadata. McGraw -Hill Osborne Media (2011) \n[Had13] \nhttp://hadoop.apache.org/ , last accessed 05 -10-2013 \n[HIL+09] P. J. Haas, I. Ilyas, G. Lohman, V. Markl: Disco vering \nand Exploiting Statistical Properties for Query Optimization in Relational Databases: A Survey. Statistical Analysis and Data Mining 1(4): 223 -250 \n(2009) \n[Hup93 ] K. Huppler: The Art of Building a Good Benchmark. \nTPCTC 2009: 18- 30 (2009) \n[IMH+04] I. Ilyas, V . Markl, P . Haas, P . Brown, A . Aboulnaga: \nCORDS: Automatic Discovery of Cor relations and \nSoft Functional Dependencies. SIGMOD Conference 2004: 647-658 \n[Imp13] \nhttps://github.com/cloudera/impala , last accessed 05 -\n10-2013 \n[LBG+12] Y. Low, D. Bickson , J. Gonzalez, C. Guestrin , A. \nKyrola , J. M. Hellerstein: DistributedGraphLab: A \nframework for machine learning and data mining in \nthe cloud. Proceedings of the VLDB Endowment, \n5(8), pp. 
716-727 (2012) \n[Mah13] Mahout: http://mahout.apache.org/ , last accessed 04 -\n21-2013 \n[Myr13] https://github.com/TU -Berlin-DIMA/myriad -\ntoolkit/wiki , last accessed 05 -10-2013 \n[Nau13] http://www.hpi.uni-\npotsdam.de/naumann/publications/publications_by_ty\npe/year/2013/2276/Nau13.html , SIGMOD Record \n(2013) \n[Oli13] https://github.com/TU -Berlin-DIMA/myriad -\ntoolkit/wiki/Using -Oligos -Guide , last accessed 05 -10-\n2013 \n[ORS+08] C. Olston, B. Reed, U. Srivastava, R. Kumar, A. \nTomkins: Pig Latin: A Not-So -Foreign Language for \nData Processing. Proceedings of the SIGMOD \nConference (SIGMOD), pp. 1099 -1110, (2008) \n[PDG13] http://www.paralleldatageneration.org/drupal6/ , last \naccessed 05 -10-2013 \n[RFS+10] T. Rabl, M . Frank, H . Sergieh, H . Kosch: A Data \nGenerator for Cloud-Scale Benchmarking. TPCTC 2010: 41- 56 \n[SBH+06] Y. Sismanis, P. Brown, P. Haas, B. Reinwald: \nGORDIAN: Efficient and Scalable Discovery of Composite Keys. VLDB 2006: 691-702 \n[SHM+06] U. Srivastava, P. Haas, V . Markl, M . Kutsch, T .Tran: \nISOMER: Consistent Histogram Construction Using \nQuery Feedback. ICDE (2006) \n[Spa 13] \nhttp://spark -project.org/ , last accessed 05 -10-2013 \n[Str13] http://www.stratosphere.eu/ , last accessed 05 -10-2013 \n[Tor12] E. Torlak: Scalable test data generation from multidimensional models . Proceedings of the ACM \nSIGSOFT 20th International Symposium on the \nFoundations of S oftware Engineering (2012) \n[TPC13] \nhttp://www.tpc.org , last accessed 05 -10-2013 \n[TSJ+09] A. Thusoo, J. S.Sarma, N. Jain, Z. Shao, P.Chakka, S. Anthony, H. Liu, P. Wyckoff, R. Murthy: Hive - A \nWarehousing Solution Over a M ap-Reduce \nFramework. PVLDB 2(2), pp. 1626- 1629 (2009) \n[XML13] \nhttp://www.xml-benchmark.org/ , last accessed 05 -10-\n2013 [PPR+09] A. Pavlo, E . Paulson, A . Rasin, D. Abadi, D . DeWitt, \nS. Madden, M . Stonebraker: A comparison of \napproaches to large-scale data analysis. 
SIGMOD \nConference 2009: 165 -178 \n[DQJ+10] J. Dittrich, J . Quiané -Ruiz, A . Jindal, Y . Kargin, V . \nSetty, J . Schad: Hadoop++: Making a Yellow \nElephant Run Like a Cheetah (Without It Even \nNoticing). PVLDB 3(1): 518- 529 (2010) \n[JOS+10] D. Jiang, B . C. Ooi, L. Shi, S . Wu: The Performance \nof MapReduce: An In-depth Study. PVLDB 3(1):472-483 (2010) \n[LCK+05] J. Leskovec, D . Chakrabarti, J . Kleinberg, C . \nFaloutsos: Realistic, Mathematically Tractable Graph Gene ration and Evolution, Using Kronecker \nMultiplication. PKDD 2005: 133 -145 \n[KTF09] U. Kang, C .E. Tsourakakis, C . Faloutsos: PEGASUS: \nA Peta -Scale Graph Mining System. ICDM 2009: \n229-238 \n[KTA+11] U. Kang, C.E. Tsourakakis, A.P. Appel, C . Faloutsos, \nJ. Leskovec: HADI: Mining Radii of Large Graphs. \nTKDD 5(2): 8 (2011) \n ",
"metadata": {
"filename": "alexandrov2013.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\alexandrov2013.pdf",
- "file_size": 123038,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:33.788781",
- "content_length": 27035
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\alexandrov2013.pdf",
+ "size": 123038,
+ "source": "docs_to_import"
+ },
+ "id": "1dbda3bb-553e-4f98-9333-3464241cfcd5"
},
- "c1e3eb5c-c9e9-4b54-91de-8f4b3dfab991": {
- "id": "c1e3eb5c-c9e9-4b54-91de-8f4b3dfab991",
- "content": "[Página 1]\nComputers in Biology and Medicine 163 (2023) 107166\nAvailable online 9 June 2023\n0010-4825/© 2023 Elsevier Ltd. All rights reserved.An enhanced grey wolf optimizer boosted machine learning prediction \nmodel for patient-flow prediction \nXiang Zhanga, Bin Lub, Lyuzheng Zhangc, Zhifang Pand, Minjie Liaoa, Huihui Shena, \nLi Zhange, Lei Liuf, Zuxiang Lig,*, YiPao Huh,**, Zhihong Gaoi,*** \naWenzhou Data Management and Development Group Co.,Ltd, Wenzhou, Zhejiang, 325000, China \nbWenzhou City Bureau of Justice, Wenzhou, Zhejiang, 325000, China \ncB-soft Co.,Ltd., B-soft Wisdom Building, No.92 Yueda Lane, Binjiang District, Hangzhou, 310052, China \ndThe First Affiliated Hospital of Wenzhou Medical University, Wenzhou, 325000, China \neWenzhou Hongsheng Intellectual Property Agency (General Partnership), Wenzhou, Zhejiang, 325000, China \nfCollege of Computer Science, Sichuan University, Chengdu, Sichuan, 610065, China \ngOrganization Department of the Party Committee, Wenzhou University, Wenzhou, 325000, China \nhWenzhou Health Commission, Wenzhou, Zhejiang, 325000, China \niZhejiang Engineering Research Center of Intelligent Medicine, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, 325000, China \nARTICLE INFO \nKeywords: \nPatient-flow prediction \nSupport vector regression \nMachine learning \nMeta-heuristic \nSwarm-intelligence ABSTRACT \nLarge and medium-sized general hospitals have adopted artificial intelligence big data systems to optimize the \nmanagement of medical resources to improve the quality of hospital outpatient services and decrease patient \nwait times in recent years as a result of the development of medical information technology and the rise of big \nmedical data. However, owing to the impact of several elements, including the physical environment, patient, \nand physician behaviours, the real optimum treatment effect does not meet expectations. 
In order to promote \norderly patient access, this work provides a patient-flow prediction model that takes into account shifting dy-\nnamics and objective rules of patient-flow to handle this issue and forecast patients ’ medical requirements. First, \nwe propose a high-performance optimization method (SRXGWO) and integrate the Sobol sequence, Cauchy \nrandom replacement strategy, and directional mutation mechanism into the grey wolf optimization (GWO) al-\ngorithm. The patient-flow prediction model (SRXGWO-SVR) is then proposed using SRXGWO to optimize the \nparameters of support vector regression (SVR). Twelve high-performance algorithms are examined in the \nbenchmark function experiments ’ ablation and peer algorithm comparison tests, which are intended to validate \nSRXGWO ’s optimization performance. In order to forecast independently in the patient-flow prediction trials, the \ndata set is split into training and test sets. The findings demonstrated that SRXGWO-SVR outperformed the other \nseven peer models in terms of prediction accuracy and error. As a result, SRXGWO-SVR is anticipated to be a \nreliable and efficient patient-flow forecast system that may help hospitals manage medical resources as effec-\ntively as possible. \n1.Introduction \nPrimary medical care is the guarantee of people ’s survival and \ndevelopment. With the continuous development of economic, cultural, \nand social construction, people ’s demand for medical resources is much higher. Their awareness of medical and health care also increases re-\nquirements for the current medical industry. Since the medical service \nsystem is complex, it is not only influenced by factors such as local de-\nmographic characteristics, socio-economic conditions, natural environ -\nmental conditions, medical hardware, software facilities, and patient \n*Corresponding author. \n**Corresponding author. \n***Corresponding author. \nE-mail addresses: zhxan@126.com (X. Zhang), wzlubin@139.com (B. 
Lu), 66199293@qq.com (L. Zhang), panzhifang@wmu.edu.cn (Z. Pan), 1829820@qq.com \n(M. Liao), ylvias7@126.com (H. Shen), 101744491@qq.com (L. Zhang), liulei.cx@gmail.com (L. Liu), lizuxiang@wzu.edu.cn (Z. Li), huyipao@outlook.com (Y. Hu), \ngzh@wzhospital.cn (Z. Gao). \nContents lists available at ScienceDirect \nComputers in Biology and Medicine \nu{�~zkw! s{yo|kro>! ÐÐÐ1ow�o �to~1m{y2w{m k�o2m{y|lt{ yon!\nhttps://doi.org/10.1016/j.compbiomed.2023.107166 \nReceived 10 March 2023; Received in revised form 25 May 2023; Accepted 8 June 2023\n\n[Página 2]\nComputers in Biology and Medicine 163 (2023) 107166\n2and doctor behaviors [1]. But there are also various interactions and \npositive and negative feedback between these influencing factors, which \nmay result in the longer the waiting time in the hospital, the more \nattractive the patients are, or the regular changes in the hospital waiting \nqueue, etc. Self-organized regularities and Emergent behavior make it \ndifficult for hospitals to implement optimal outpatient management \nmeasures and cause the actual use of available resources not to match \nthe expected results [2]. Therefore, to improve the efficiency of existing \nmedical resources, improve the quality of hospital outpatient services, \nshorten patient waiting queues and waiting times, it is crucial to un-\nderstand the changing dynamics and objective patterns of patient-flow \nto provide a basis for dynamic adjustment of physician consultation \nplans and to achieve orderly and effective patient control. \nIn recent years, the advancement of medical informatization and the \nrise of big medical data has allowed studying patient-flow prediction \nbased on big data mining. Researchers have conducted some research in \nthe analysis of patient-flow change patterns, analysis of patient-flow \ninfluencing factors, and patient-flow prediction. Li et al. 
[3] proposed \na time series patient-flow prediction method based on XGBoost, a sup-\nport vector machine (SVM), to solve the problem of planning and allo-\ncation of healthcare resources by government and hospital management. \nNikakhtar et al. [4] proposed a patient visit prediction model based on \neigendistance and mesocentricity that can help healthcare managers and \ndecision-makers predict the trend of infectious patient-flow. Sharafat \net al. [5] proposed an emergency room patient-flow prediction model \n(PatientFlowNet) based on a deep learning framework, including pre-\ndicting arrival, treatment, and discharge rates. The results show that \nPatientFlowNet has higher accuracy and lower average absolute error \nthan the benchmark algorithm. Tavakoli et al. [6] proposed a seasonal \nautoregressive integrated moving average (SARIMA) model for \npatient-flow prediction of the current epidemic of neocrown pneumonia \ndisease, effectively predicting the number of patients’ visits to Thai \nhospitals in the next 30. According to the current research status, it is \neasy to find that more and more researchers are using machine learning \ntechniques to predict the number of patient visits in hospitals. However, \nsince most of the prediction models use a monadic time-series feature \nprediction method and the changes of patient-flow are affected by a \nvariety of complex factors and do not have obvious linear characteris -\ntics, resulting in the accuracy of the models is not high. On the other \nhand, it is limited by the defects of the classification predictor itself, \nwhich leads to large prediction bias of prediction models based on SVM \nand other prediction models. Therefore, how to improve the accuracy \nand reduce the error of patient-flow prediction models is a major chal-\nlenge in current medical resource scheduling research. 
\nAs a novel optimization method with strong robustness and flexi-\nbility, the swarm intelligence optimization algorithm is widely used in \npredictive optimization problems. The swarm intelligence optimization \nalgorithm is a stochastic optimization algorithm abstracted by simu-\nlating the collaborative behavior of animals, insects, and other organ -\nisms. The current well-known algorithms are, grey wolf optimization \n(GWO) [7], bat-inspired algorithm (BA) [8], different evolution (DE) \n[9], sine cosine algorithm (SCA) [10], salp swarm algorithm (SSA) [11], \nwhale optimizer (WOA) [12], moth-flame optimization (MFO) [13], \nparticle swarm optimization (PSO) [14], hunger games search (HGS) \n[15], Harris hawks optimization (HHO) [16], rime optimization algo-\nrithm (RIME) [17], colony predation algorithm (CPA) [18], Runge Kutta \noptimizer (RUN) [19], weighted mean of vectors (INFO) [20], slime \nmould algorithm (SMA) [21,22], opposition-based SCA (OBSCA) [23], \nmodified SCA (m_SCA) [24], boosted GWO (OBLGWO) [25], A-C para-\nmetric WOA (ACWOA) [26], fruit fly optimizer (FOA) with \nmulti-population outpost mechanism (MOFOA) [27], SCA with differ -\nential evolution (SCADE) [28], and so on. They also have been applied to \nsolve many problems such as bankruptcy prediction [29], feature se-\nlection [30–34], economic emission dispatch [35], multi-objective \noptimization [36], global optimization [37,38], dynamic \nmulti-objective optimization [39], numerical optimization [40–42], scheduling optimization [43,44], feed-forward neural networks [45], \nmedical image segmentation [46–48], feature selection [49,50], per-\nformance optimization [51,52], identification of pulmonary hyperten -\nsion animal [53], constrained multi-objective optimization [54], and \nlarge-scale complex optimization [55]. \nMore and more researchers are considering optimizing models using \nswarm intelligence optimization methods to improve the accuracy of \nprediction methods. 
Chou et al. [56] proposed a swarm intelligence \nalgorithm-based support vector machine prediction model (SFALSSVM) \nusing the smart firefly algorithm (SFA) to optimize the parameters of the \nleast squares support vector regression (SVR) and successfully applied it \nto several geotechnical engineering problems. Kaushik et al. [57] pro-\nposed a binary swarm intelligence algorithm by combining the firefly \nalgorithm and bat algorithm with a wavelet neural network (WNN) and \noffered a prediction model for software development effort (SDEE), \nwhich has high prediction accuracy. Mehraein et al. [58] proposed a \nCatBoost (CB) prediction model based on a swarm intelligence algorithm \nfor predicting the monthly flow of satellite precipitation data and \ndemonstrated a significant reduction in the root mean square error \n(RMSE) of the proposed CB compared with an artificial neural network \n(ANN). Zhu et al. [59] combined the WOA and the simulated annealing \nalgorithm (SA) to optimize the kernel extreme learning machine \n(KELM). They proposed an enhanced search-based prediction algorithm \n(EMWS) that effectively addresses defect prediction in software \nmodules. \nZhou et al. [60] improved the Firefly algorithm (FA) by incorpo -\nrating chaotic mapping, adaptive inertia weights, and Levy flight for \naccurate prediction of reinforcement tensile loads for assessing the in-\nternal stability of geosynthetic reinforced soil (GRS) structures. They \nused the improved FA to optimize the hyperparameters of the \nleast-squares SVR model. The improved SVR model had excellent ac-\ncuracy with an average absolute percentage error of less than 10%. Ma \net al. [61] proposed an SVR prediction model integrated with k-fold \ncross-validation (CV) and used an artificial bee colony (ABC) algorithm \nand genetic algorithm (GA) to optimize the hyperparameters of the \nmodel. 
The results showed that the hybrid approach can be used to \ndetermine the optimal hyperparameters and present statistical signifi -\ncance. Huang et al. [62] proposed a swarm intelligence algorithm (DFP) \nintegrating floral pollination algorithm (FPA) and differential evolution \n(DE) and an algorithmic model for predicting the groutability of cement \npaste in combination with SVR. Luo et al. proposed a hybrid prediction \nmodel (LS-SVMR) using a coupled simulated annealing (CSA) algorithm \nto optimize the hyperparameter selection of SVR, which effectively \nimplemented the lateral strength prediction of reinforced concrete (RC) \ncolumns. \nBased on the above improvement methods for prediction models, it \ncan be found that swarm intelligence optimization algorithms can \neffectively help prediction models find optimal hyperparameters, and \nSVR is applied very frequently in many models. However, due to the \nvariety of swarm intelligence algorithms, each algorithm has defects, \nsuch as low convergence accuracy, slow search speed, and easy falling \ninto local optimality. Therefore, in this paper, to accurately predict the \nnumber of patients and reasonably schedule medical resources, an SVR \nprediction model based on improved GWO is proposed using the GWO \nalgorithm with high exploitation capability combined with SVR pre-\ndiction methods. First, to give full play to the exploitation advantages of \nGWO and overcome the shortcomings of GWO in the search process as \nmuch as possible, the following three methods are used for improve -\nment: (1) To address the problem of narrow coverage of the initialized \nsearch agent of GWO, the original random initialization method is used \ninstead of Sobol sequence to expand the distribution of the initial so-\nlution. 
(2) To address the problem of too little information exchange \namong GWO search agents, a directional mutation mechanism is used to \nincrease the interactivity of solutions, improving the algorithm’s search \nefficiency. (3) To address the problem of imbalance between GWO \nsearch and exploitation, a Cauchy random replacement strategy is added X. Zhang et al.\n\n[Página 3]\nComputers in Biology and Medicine 163 (2023) 107166\n3to the core update formula to adjust the weights of search and exploi -\ntation of the algorithm in the iterative process. Based on the above ideas, \nSobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism are intro-\nduced into GWO to propose a high-performance GWO variant \n(SRXGWO). Then, to verify the optimization performance of SRXGWO, \nthis paper designs comparative simulation experiments based on the \nclassical IEEE CEC2014 test set and compares SRXGWO with other X \nmethods. The experiments show that the proposed SRXGWO method \nsignificantly improves initialization, search efficiency, and defects of \niterative balance. This paper also analyzes the comparative results using \nthe Wilcoxon signed-rank test [63] and the Friedman test [64]. \nSRXGWO has a higher convergence speed compared with peer algo-\nrithms and accuracy. \nFurther, this paper proposes a multivariate SRXGWO-SVR prediction \nmodel for predicting patient flow by optimizing two hyperparameters of \nSVR using high-performance SRXGWO. To validate the real prediction \nability of the SRXGWO-SVR model, the prediction results of the model \nare presented in detail using real clinical data sets and divided into \ntraining and test sets. 
Further, the SRXGWO-SVR model based on \nSRXGWO, the GWO-SVR model based on GWO, and the original SVR \nmodel are compared in this paper, and the experimental results also \ndemonstrate that the SRXGWO-SVR can effectively outperform the two \noriginal models without improvement. Finally, this paper also compares \nthe SRXGWO-SVR model with well-known prediction models such as \nRadial basis function networks, convolutional neural networks, etc. R- \nsquared (R2), root mean squared error (RMSE), and mean absolute error \n(MAE) are used for validation and confirm that SRXGWO-SVR is more \nadvantageous in predicting hospital patient-flow. The data set used in \nthis paper is the attendance statistics of Wenzhou Medical University \nHospital in China, which serves a radius of nearly 30 million people and \nhas an annual outpatient volume of 5.3 million. Due to the large volume \nof data, the latest data from January 2022 to September 2022 is selected, \nwith a sample size of 240 items. The main contributions of this paper are \nas follows. \n1. Sobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism are \nintroduced into GWO to propose a high-performance algorithm \nSRXGWO. The strategies and mechanisms employed in this paper can \nprovide a valid reference for the field of evolutionary computation. \n2.We designed experiments comparing SRXGWO with 12 similar al-\ngorithms to verify the algorithm ’s improvement ideas and optimi -\nzation performance. Experiments can effectively demonstrate the \nperformance of SRXGWO ’s benchmark functions and provide illus-\ntrations for their specific applications. \n3. SRXGWO is used to optimize the hyperparameters of SVR, and the \nSRXGWO-SVR multivariate prediction model is proposed and suc-\ncessfully applied to predict patient flow. The proposed model can \neffectively predict patient flow and provide useful suggestions for \nhospital management. \n4. 
We designed a comparison experiment between SRXGWO-SVR and \neight similar prediction models to verify the effectiveness of the \nimprovement and the accuracy of the prediction. The experiments \nillustrate that the proposed model has great potential for predicting \nother time series problems. \nThe rest of this paper is organized as follows. Section 2 describes the \nprediction dataset, the original GWO, and SVR. In Section 3, SRXGWO is \nproposed based on three improvement strategies, and the SRXGWO-SVR \nmodel is proposed in conjunction with SVR. In Section 4, benchmark \nfunction comparison experiments and simulation prediction comparison \nexperiments are designed. Finally, Section 5 summarizes the work of this \npaper and illustrates further research directions. 2.Materials and methods \nThis section introduces the swarm intelligence optimization algo-\nrithm GWO and the regression prediction model SVR used in this study. \n2.1. Description of GWO algorithm \nIn the GWO algorithm, grey wolf individuals are divided into four \nclasses: α、β、δ and ω. α is mainly responsible for participating in the \ndecision-making and management of the pack; ω is for other grey wolf \nindividuals; β and δ are for grey wolf individuals with the second highest \nadaptation level to α. The GWO algorithm focuses on three behaviors: \nencirclement behavior, hunting behavior, and attack behavior. \n1. Encirclement behavior \nThe first stage of prey predation by grey wolves is to encircle the \nprey, and the mathematical model can be described by Eq. (1) and Eq. \n(2). \nD↗⃦⃦⃦⃦C↗⋅X↗\np
t\u0000X↗
t⃦⃦⃦⃦(1) \nX↗
t1X↗\nv
t\u0000A↗⋅D↗(2) \nwhere D↗is the distance between the prey and the wolves; A↗2a⋅r2\u0000\na, C↗2⋅r↗\n2; X↗is the current location of the wolves; t is the number of \ncurrent iterations; X↗\np is the location of the prey; r1 , r2 are random \nnumbers, between 0C1; a∃2C0. \n2. Hunting behavior \nAfter a wolf pack surrounds a prey, it will hunt the surrounding prey. \nIf α is the global optimal solution, β is the global second solution, and δ is \nthe global third solution, then the mathematical model of α, β, and δ \nrepositioning can be described by Eqs. (3)–(5). \nD↗\nα⃦⃦⃦⃦C↗\n1⋅X↗\nα\u0000X↗⃦⃦⃦⃦(3) \nD↗\nβ⃦⃦⃦⃦C↗\n2⋅X↗\nβ\u0000X↗⃦⃦⃦⃦(4) \nD↗\nδ⃦⃦⃦⃦C↗\n2⋅X↗\nδ\u0000X↗⃦⃦⃦⃦(5) \nwhere D↗\nα, D↗\nβ and D↗\nδ denote the approximate distances of α, βCand δ \nfrom X↗, respectively; X↗\nα, X↗\nβ, X↗\nδ denote the position information of α, \nβ, and δ, respectively; C↗\n1, C↗\n2 and C↗\n3 denote the random vectors, \nrespectively. The current solution X↗and the updated solution X↗
t1\ncan be described by Eq. (6)-Eq. (9). \nX↗\n1X↗\nα\u0000A↗\n1⋅(\nD↗\nα)\n(6) \nX↗\n2X↗\nβ\u0000A↗\n2⋅[\nD↗\nβ]\n(7) \nX↗\n3X↗\nδ\u0000A↗\n3⋅(\nD↗\nδ)\n(8) \nX↗′\n
t1[\nX↗\n1X↗\n2X↗\n3][\n3 (9) \nwhere A↗\n1 , A↗\n2 , and A↗\n3 denote random vectors, respectively. X. Zhang et al.\n\n[Página 4]\nComputers in Biology and Medicine 163 (2023) 107166\n43. Attack behavior \nThe final stage of the GWO algorithm is the prey attack phase, which \ncan be achieved by adjusting the parameter A. If †A†≼1, the whole wolf \npack approaches the prey
X∗CY∗and focuses on the prey; if †A†F1, the \nwhole wolf pack moves away from the prey and looks for new prey \nagain. \n2.2. Description of support vector regression \nSupport vector machine (SVM) models are used to classify data by \nmapping the input metric data to a higher dimensional space, then \nconstructing an optimal hyperplane in this higher dimensional space so \nthat the constructed hyperplane has the largest edges to classify the \ninput data. The learning strategy used by the support vector machine is \ninterval maximization, which can be formalized as solving a convex \nquadratic programming problem. \nInstead of the traditional statistical induction followed by deduction, \nthe SVR model constructs a regression function to infer a prediction \nmodel on the training data and then uses the model to make predictions. \nThe objective of SVR modeling is to build a classification surface that \nseparates the two types of samples as well as possible. SVR modeling \naims to minimize the distance between all the sample data and the \nclassification surface. The accuracy of the SVR model is highly depen -\ndent on the kernel function ’s quality and the penalty factor ’s accuracy, \nand the appropriate choice of parameters dramatically improves the \naccuracy of the regression model. When the parameters of the regression \nmodel are not selected appropriately, the regression model will not be \napplicable to solve the actual problem. For the training data, regression \naims to solve the following regression function, as in Eq. (10). \nf
y〈W0y〉b (10) \nThe above equation is 〈w0y〉 is the inner product of w and y. The \nfollowing equation is the constraint to solve the constrained optimiza -\ntion problem: \nMin 1⎡\n2Dw0wFĈm\ni1\u0000\nξiξ∗\ni)\n(11) \nZi\u0000Dw0yiFb≼εξi (12) \nDw0yiF\u0000zib≼εyj\u0000yk (13) \nwhere C represents the penalty factor of the model, the value of C is \npositively related to the complexity of the model, the complexity of the \nmodel increases with the value of C, and the value of C is negatively \nrelated to the computational error of the model, the error of the model \nbecomes smaller as the value of C increases. \nThe solution of the optimization problem is first transformed into the \ncorresponding pairwise problem and, secondly transformed into the \nsolution of the maximum constraint value by introducing the kernel \nfunction. Finally, the regression equation of the model is shown in Eq. \n(14). \nf
ŷm\ni1\u0000\naj\u0000aj)0k\u0000\nyCyj)\nb (14) \n3.The proposed method \nIn this section, three improvement ideas are described, namely, \nSobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism. Finally, the \nproposed SRXGWO is used to optimize the hyperparameters of the SVR \nmodel, and the patient-flow prediction model SRXGWO-SVR is \nproposed. 3.1. Proposed GWO variant \n3.1.1. Sobol sequence-based population initialization \nThe population initialization of the original GWO algorithm is \nrandomly generated, which primarily affects the algorithm ’s perfor -\nmance. In contrast, the Sobol sequence can make the spatial points \nuniformly distributed and generate unlimited samples without pre- \ndetermining the number of samples and storing them. Therefore, this \npaper introduces the Sobol sequence to filter the initialization position \nof the grey wolf population, improve the uniformity and diversity of the \ngrey wolf population, and improve the performance of the original GWO \nalgorithm. \nEach dimension of the Sobol sequence is a Radical inversion with \nbase 2, and each dimension has a different generating matrix C. When C \nis taken as a unit vector, the corresponding Sobol sequence is repre -\nsented as \nN
îM\nk12\u0000kak
i (15) \nwhere i is denoted as a binary number, ak
ion each bit of the number is \narranged as a vector, which is mirrored to the right of the decimal point \nand converted to decimal, resulting in a one-dimensional Sobol \nsequence Xi⊔N
1CN
2…CN
iCi∃N⊓, and a multi-dimensional Sobol \nsequence is obtained by multiplying the generating matrix C of each \ndimension. The Sobol sequence is used to uniformly distribute n points \nwithin the threshold of the target parameter search as the initialized \npopulation space location. The first three solutions are defined as α, β, \nand δ wolves, respectively. To confirm the effectiveness of Sobol \nsequence-based population initialization, Ablation experiments of \nSRXGWO are designed in Section 4.1.2 , where SGWO is the improved \nGWO using this strategy alone. \n3.1.2. Cauchy random replacement strategy \nIn the iterative process, the position update of GWO is conservative. \nOn the one hand, such an update is beneficial to the exploitation of the \nalgorithm. Still, on the other hand, it may cause the algorithm to have a \npoor quality of the search solution and fall into local optimum when \ndealing with multi-peaked problems. Therefore, in this paper, to solve \nthis problem, the Cauchy replacement search strategy is used to \nappropriately perturb the dimensionality of the search agent and \nimprove the interaction between individuals. \nSpecifically, firstly, the grey wolf population with the number of \nindividuals N is traversed by the parameter l, and the selected one is the \nXl individual. Then, according to the ratio of the remaining runs of the \nalgorithm to the total number of runs compared with the Cauchy \nrandom number, if the Cauchy random number is less than the ratio, the \nh-th dimensional value of Xl is replaced with the hth dimensional value \nof the optimal solution α wolves. Finally, the fitness value of the updated \nXl The evaluation function calculates the optimal solution, and the \noptimal fitness value are replaced if the fitness value is better than the \noptimal solution. Otherwise, it remains unchanged. 
To confirm the \neffectiveness of the Cauchy replacement search strategy, RGWO in \nAblation experiments of SRXGWO is the GWO improved using this \nstrategy alone. \n3.1.3. Directional mutation strategy \nSince the original GWO relies too much on the searchability of the \ntop three ranked wolves to find the optimal solution, it is easy to fall into \nthe local optimal trap and reduce the accuracy of the optimal solution. \nTherefore, this paper proposes a directional mutation strategy based on \ngenetic algorithms ’ mutation and crossover strategies. The directional \nmutation strategy consists of two important operations: directional \ncrossover and directional variation. \n1. Directional crossover (DM) X. Zhang et al.\n\n[Página 5]\nComputers in Biology and Medicine 163 (2023) 107166\n5The when-directed crossover mechanism uses the position informa -\ntion of the current iteration ’s optimal individual to guide the in-\ndividual ’s next change trend. There are four main parameters, which are \ncrossover rate (pc), variable crossover probability (pcv), directional \nprobability (pd) and multiplication factor (α). First, the execution of the \ndirected crossover mechanism requires different parent individuals in \nthe current population. The parent individuals are generated by random \nselection from the population, pj\n1 and pj\n2, j∃1Cd]. pj\nmean and pj\nbest are the \nmean value of the parent individuals in the jth dimension and the value \nof the best individual in the jth dimension, respectively. In the first case, \nwhen pj\nbest≽pj\nmean (c1 and c2 does the directed hybridization mechanism \ngenerate the individuals). \nval1\u00000B5e⌈\n†pj\n1\u0000pj\n2†\n
yj\nu\u0000yj\nl⌉\n(16) \nβr3\nα2(17) \nc1val∗\u0000\npj\n1\u0000pj\n2)\nαr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (18) \nc2
1\u0000val∗\u0000\npj\n1\u0000pj\n2)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (19) \nc1val∗\u0000\npj\n1pj\n2)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (20) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\nα
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (21) \nWhen pj\nbestDpj\nmean. \nc1val∗\u0000\npj\n1pj\n2)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (22) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\nα
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (23) \nc1val∗\u0000\npj\n1pj\n2)\nαr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (24) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (25) \nIf the parent individuals have the same value, but pj\nbestℑpj\nmean. \nval1\u00000B5e⌈\n†pj\nbest\u0000pj\nmean†\n
yj\nu\u0000yj\nl⌉\n(26) \nβr3\nα2(27) \nc1val∗\u0000\npj\nbestpj\nmean)\nαr3∗e
1\u0000β∗
1\u0000val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4≼pd\n(28) \nc2
1\u0000val∗\u0000\npj\nbestpj\nmean)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4\n≼pd\n(29) \nc1val∗\u0000\npj\nbestpj\nmean)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4Fpd\n(30) \nc2
1\u0000val∗\u0000\npj\nbestpj\nmean)\nα
1\u0000r3∗e
\u0000β∗val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4Fpd\n(31) \nwhere r3 and r4 are two different random numbers, r3∃
0C1and r4∃\n
0C1. val and β are two parameters computed in each iteration. yj\nu and yj\nl \nare the upper and lower bounds of the individual in the jth dimension, \nrespectively. А is the multiplicative factor. \n2. Directional variation First, assume that the dimensions of population size and objective \nfunction are D and d, respectively. Assume that the current iteration \nindividual is y. The guided variation mechanism guides the variation of \nthe current iteration individual y based on the position information of \nthe current optimal individual ybest. When individual y is selected for \nguided mutation operation, the DM mechanism will compare the size of \nyj\ni and yj\nbest, if yj\nbest≽yj\ni. \nβ1e[\n2r\u00002\nr]\n(32) \nβ2e[\nr\u00002\nr]\n(33) \nym|\n〈\n⎜yj\niβ1∗\u0000\nyj\nu\u0000yj\ni)\nCifr2≼pd\nyj\ni\u0000β2∗\u0000\nyj\ni\u0000yj\nl)\nCotherwise(34) \nwhere β1 and β2 are two parameters, which can also be called the \nweights that determine the change steps of the formula. r and r2 are two \nrandom numbers, r∃
0C1and r2∃
0C1, rℑ0. yj\nu and yj\nl are the upper \nand lower bounds of the individual in the jth dimension, respectively. pd \nrepresents the orientation probability, pd∃
0B5C1. If yj\nbestDyj\ni. \nym|\n〈\n⎜yj\ni\u0000β1∗\u0000\nyj\ni\u0000yj\nl)\nCifr2≼pd\nyj\niβ2∗\u0000\nyj\nu\u0000yj\ni)\nCotherwise(35) \nTo illustrate the effectiveness of the Directional mutation strategy, \nthe XGWO in ablation experiments of SRXGWO is the GWO improved \nusing this strategy alone. \n3.1.4. Proposed SRXGWO \nThe analysis shows that GWO is an excellent algorithm with solid \nexploitation capability, but several aspects still need improvement. First, \nGWO is randomly generated with strong uncertainty in the initialization \nof the grey wolf population, which will lead to the initial solution of the \nwhole population cannot effectively cover the solution space of the \nproblem, thus causing problems such as low efficiency in the search \nphase. Secondly, the lack of information exchange among individuals in \nthe iterative process of GWO tends to make the algorithm suffer from \npoor-quality of search solutions and fall into local optimum when \ndealing with multi-peaked problems. In addition, GWO relies too much \non the exploitation ability of the top three ranked wolves to find the \noptimal solution, which cannot effectively search the whole solution \nspace, leading to the inability to find the optimal solution and reducing \nthe quality of understanding. \nTherefore, this paper addresses the above three problems and makes \ncorresponding improvements to GWO. First, Sobol sequence-based \npopulation initialization is used instead of the original random initiali -\nzation method to generate a low-sequence population of grey wolves, \nwhich covers the whole solution space uniformly. Second, the dimen -\nsional values between search agents are effectively exchanged by Cau-\nchy’s random replacement strategy to enhance the information \nexchange between individuals and improve the exploitation capability \nof the algorithm. 
Third, the directional mutation mechanism is intro-\nduced to perform crossover and mutation at the level of the search so-\nlution, and the crossover or mutation operation is performed for the \nnature of the current individuals, which effectively improves the search \nability of the algorithm and the ability to jump out of the local optimum. \nThe algorithm flowchart of SRXGWO as shown in Fig. 1. \n3.2. The proposed SRXGWO-SVR model \nTo accurately predict the number of patients and reasonably \nschedule medical resources, this section combines the high-performance X. Zhang et al.\n\n[Página 6]\nComputers in Biology and Medicine 163 (2023) 107166\n6SRXGWO algorithm with the SVR prediction method and proposes the \nSRXGWO-SVR, an SVR prediction model based on the improved GWO. \nAccording to Section 2.2, SVR is a supervised machine learning \nmethod with two key parameters: the penalty parameter C and the \nkernel function parameter g. The penalty parameter C affects the \ncomplexity and stability of the model, the kernel function parameter \nreflects the distribution of samples in the feature space, and the \nparameter selection directly impacts the prediction accuracy and \ngeneralization ability of the model. Therefore, to address the above is-\nsues, SRXGWO is introduced to optimize the radial basis kernel function \nparameters and penalty factors in the SVR patient-flow prediction model \nto form the best combination of parameters to improve the prediction \naccuracy and reduce the error size. The specific steps for building the \nSRXGWO-SVR model are as follows. \n(1) Data pre-processing. Routine data pre-processing is performed on \nthe collected patient-flow data, including data cleaning, missing \nvalue processing, outlier processing, etc. \n(2) Establish the objective function. The sample data are substituted \ninto the mean square error minimization function as shown in Eq. 
(26), and then the optimal penalty factor C and radial basis kernel function parameter γ are obtained.

$$Q_{\min}=\frac{1}{n}\sum_{k=1}^{n}\left(y_k-\hat{y}_k\right)^2,\qquad \text{s.t. } C\in\left[C_{\min},C_{\max}\right],\ \gamma\in\left[\gamma_{\min},\gamma_{\max}\right] \tag{36}$$

where $y_k$ denotes the actual size of the patient flow, and $\hat{y}_k$ denotes the corresponding predicted size of the patient flow.

(3) Search for hyperparameters using SRXGWO. First, the parameters involved in the SRXGWO algorithm are set initially. The fitness function RMSE is applied to calculate the fitness values of the population individuals, where m is the number of samples.

$$\mathrm{RMSE}=\sqrt{\frac{1}{m}\sum_{k=1}^{m}\left(y_k-\hat{y}_k\right)^2} \tag{37}$$

(4) Determine whether the maximum number of iterations is reached. The iteration is continued if the maximum number of iterations is not reached. Suppose the maximum number of iterations is reached. In that case, the C and γ corresponding to the optimal individual location information are output. The best combination of the two parameters is applied to build the SRXGWO-SVR prediction model. Then the patient-flow dataset is predicted.

The flow chart of the SRXGWO-SVR prediction model based on hospital patient-flow proposed in this section is shown in Fig. 2.

4. Experimental results and discussions

In this section, ablation and benchmark function experiments are designed to validate the global optimization performance of SRXGWO. Then, the proposed SRXGWO-SVR is used in patient-flow prediction experiments to demonstrate the accuracy and validity of SRXGWO-SVR.

4.1. Benchmark functions comparison experiment

4.1.1. Benchmark test experiment setup

First, the running environment of the benchmark function test experiment needs to be described. The software of the experiment is Matlab 2017b and the core hardware is an Intel(R) Xeon(R) CPU E5-2660 v3 (2.60 GHz). The benchmark function test set used in this section is the widely used IEEE CEC2014 suite, described in detail in Table 1. The comparison experiments include SRXGWO, GWO, and well-known

Fig. 1. Algorithm flow chart of SRXGWO. The algorithmic complexity of SRXGWO comes mainly from Sobol sequences, core formula updates, the Cauchy random replacement strategy, and the directional mutation mechanism. The complexity level of Sobol sequence initialization is O
(N); the computational complexity level of the core formula is O(N² + N∗logN); the computational complexity level of the Cauchy random replacement strategy is O(N∗logN); and the complexity level of the directional mutation mechanism is O(N²). By comprehensive calculation, the overall complexity level of SRXGWO is O(SRXGWO) = O(
N2N∗logN. X. Zhang et al.\n\n[Página 7]\nComputers in Biology and Medicine 163 (2023) 107166\n7algorithms such as PSO, SCA, etc. Therefore, to ensure the validity and \nfairness of the experiments, all swarm intelligence algorithms are \nsearched in dimension 30, the population size is 30, the number of \nevaluations is also uniformly 300,000, and the internal parameters of \nthe algorithms are all default values. Finally, to ensure the correctness \nand validity of the experimental results, all the algorithms were run \nindependently 30 times, and the results of the experiments were further \nverified using Wilcoxon signed-rank test and the Friedman test. \n4.1.2. Ablation experiments \nIn this section, ablation experiments of SRXGWO were designed to \ndiscuss the effects of Sobol sequence-based population initialization, \nCauchy random replacement strategy, and directional mutation mech -\nanism on the effect of GWO. First, the experiments combined the three \nimproved strategies with GWO by permutation, including GWO itself, \nwith a total of eight algorithms, as shown in Table 2. In the table, S \nstands for Sobol sequence-based population, R stands for Cauchy \nrandom replacement strategy, and X stands for directional mutation \nmechanism. in addition, “1″ indicates that the current strategy is used, \nand “0″ indicates that no strategy is used. For example, SGWO uses the \nSobol sequence but not the other two strategies. \nTable 3 shows the experimental results of SRXGWO with the other \nseven algorithms, including the Wilcoxon signed-rank test results and P- \nvalue. The number of algorithms that are “better than/equal to/worse \nthan ” other algorithms. “Mean ” indicates the average ranking of the 30 \nfunctions tested, and “rank ” indicates the final overall ranking. In the \nresults of the Wilcoxon test, SRXGWO is 23 better than the unimproved \nGWO, which indicates that the improvement of GWO by the three \nimprovement strategies is very significant. 
In addition, SRXGWO has a \nsignificant advantage over SGWO, RGWO, and XGWO using a single \nmechanism, with at least 14 stronger than them. Finally, SRXGWO has \nan advantage over the two-two combination of SRGWO, SXGWO, and \nRXGWO, indicating that the three SRXGWO improvement strategies are \neffective. The table also shows the empirical p-values, and the bolded data indicate that SRXGWO is significantly different from other algo-\nrithms, and it can be said that the advantage of SRXGWO is more \nprominent compared to other algorithms. In summary, the mechanism \nemployed in SRXGWO is reasonable and effective, and can significantly \nimprove the performance of GWO. \n4.1.3. Comparison of SRXGWO with well-known peer algorithms \nIn this subsection, similar algorithm comparison experiments are \ndesigned based on 30 benchmark functions to compare SRXGWO with \n12 other peer algorithms to demonstrate that the proposed algorithm \nhas more robust optimization performance among the same type of al-\ngorithms. Among the compared algorithms, six original algorithms are \nPSO, SCA, MFO, WOA, BA, and FA, all highly cited algorithms. The other \nsix algorithms are new variants proposed recently, including OBSCA, \nm_SCA, OBLGWO, ACWOA, MOFOA, and SCADE. \nTable 4 shows the experimental results of the comparison. Where \nAVG denotes the average optimal fitness value of 30 independent ex-\nperiments, STD denotes the variance of the experiments, and the bolded \ndata are the optimal values of the current function of the algorithm. In \nthe experimental results, SRXGWO finds the optimal solution relative to \nits peer algorithms in most of the function evaluations, especially in the \nclass of complex functions F23–F30, which indicates that SRXGWO is \nmore advantageous in dealing with complex problems. In addition, the \nSTD fluctuation of SRXGWO is small, which suggests that the algorithm \nhas strong stability. 
\nSimilarly, to further validate the SRXGWO experimental results, we \nused the Wilcoxon signed-rank test to compare and validate SRXGWO, \nand the results are shown in Table 5; the Friedman test was used to verify \nthe average ranking of SRXGWO, and the results are shown in Fig. 3, \nwhich can be more intuitive to observe the comparison results. The \nWilcoxon signed-rank test results show that SRXGWO ranks first overall \nwhen comparing other algorithms and is at least 19 better than other \nhigh citation algorithms and 20 better than other variants. The Friedman \ntest shows that the average ranking of SRXGWO is slightly different, but \nFig. 2.SRXGWO-SVR prediction model based on hospital patient-flow. X. Zhang et al.\n\n[Página 8]\nComputers in Biology and Medicine 163 (2023) 107166\n8it is still better than PSO and MFO algorithms, and the overall perfor -\nmance is also the first. In summary, the results of the comparison \nexperiment are valid and reasonable, and SRXGWO does outperform \nother peer algorithms. \nTo further demonstrate the advantages of SRXGWO over other al-\ngorithms, this experiment recorded the optimization search process of \neach algorithm and plotted it as an iterative curve, as shown in Fig. 4. \nThe horizontal coordinate indicates the number of evaluations, and the \nvertical coordinate indicates the fitness value. Firstly, it can be seen that \nSRXGWO has good convergence accuracy on F6, F8, F9, F10, F11 and \nF13 in unimodal and simple multimodal function classification and \nfaster search speed than other similar algorithms. In addition, it can be \nobserved in the hybrid and combinatorial functions F16, F23, F30 that \nSRXGWO also has excellent results in solving complex optimization \nproblems. Further in the figure, SRXGWO has a clear advantage in the \nF6, F8, F9, F10, and F16 test functions. 
Both in the search period of the \nsearch process and the exploitation period of the iteration, SRXGWO can \nquickly find the current optimal solution. At the same time, the other algorithms cannot outperform SRXGWO from the beginning to the end. \nIn addition, SRXGWO has a clear decreasing inflection point in the \nmiddle of the algorithm iteration in the function tests of F11 and F16. \nFew other algorithms can continue the development, which indicates \nthat SRXGWO has a strong ability to jump out of the local optimum. \nFinally, the nine function tests in the figure demonstrate that SRXGWO \nhas stronger search and exploitation capabilities than other algorithms \nand is a high-performance optimization algorithm. In future work, it also \nbe applied to more cases, such as optimization of machine learning \nmodels [65], MRI reconstruction [66], service ecosystem [67], compu -\ntational experiments [68,69], power distribution network [70], and \nmedical signals [71,72]. \n4.2. Patient-flow prediction \nThe patient-flow dataset is presented in this section, and SRXGWO- \nSVR training and test experiments are designed. First, the patient flow \ndataset used is presented. Immediately after, the experimental setup \nincluding comparison methods, parameter settings, and evaluation \ncriteria are described. Finally, SRXGWO-SVR is proposed and applied to \nthe prediction of patient flow. \n4.2.1. Patient-flow dataset \nThe data set used in this section is the attendance statistics of \nWenzhou Medical University Hospital in China, which serves a radius of \nnearly 30 million people and has an annual outpatient volume of 5.3 \nmillion. Due to the large volume of data, the latest data from January \n2022 to September 2022 is selected, with a sample size of 240 items. The \ndata ’s main characteristic attribute is “number of appointments, ” and \nthe label attribute is “number of actuals ”. 
In addition, to reduce the \ndependence of the model on a single time series and the error of the \nprediction results, this paper also selects three independent attribute \nseries, namely, “number of people without pre-deposit system ”, “num-\nber of people without ID”, and “number of late arrivals ”. “Three inde-\npendent attribute series are selected to describe the trend changes of \npatient-flow with the influence of multiple factors. Finally, when col-\nlecting data, there are inevitably null values and outliers, and this paper \nalso preprocesses the data by removing abnormal samples and linear \ninterpolation. Fig. 5 shows a 240-day line graph of actual hospital visits. \nFirst of all, according to Fig. 5, we can see that the number of hospital \nvisits as a whole fluctuates a lot, and there is a local repetition, mostly \nbetween 14,000 and 4,200 visits. The main reason for this phenomenon \nis that the 14,000 visits are during the weekdays, i.e., Monday through \nFriday, when the hospital doctors are in regular attendance and the \nequipment is functioning normally, and the number of visits is relatively \nhigher. The 4,200 visits are due to the fact that most of the departments \nand facilities are closed during the weekends, and the number of visits is \nrelatively low. In addition, it can be seen that the average number of \nhospital visits between 180 and 220 days was very high, reaching \n18,000 at one point, and the number of weekend visits did not drop too \nmuch. This is because this period corresponds to July and August, which \nis the free time of summer vacation, and most people will concentrate on \ntheir visits during this period. In general, this data set shows a cyclical \ndistribution, and the difficulty in building the model is to reduce the \nerror while avoiding the problem of overfitting. \n4.2.2. Experimental setup \nFirst, the numerical settings of the SRXGWO and GWO algorithms \nused for hyperparameter optimization are presented. 
The number of \npopulations is set to 20, the dimension is defined as 2, the maximum \nnumber of iterations is 50, the upper and lower bounds for the value of C \nare 100 and 0.1, and the upper and lower bounds for the value of R are \nalso 100 and 0.1. Then, to prove the effectiveness of the prediction \nmodel SRXGWO-SVR improvement, the SRXGWO-SVR was compared \nwith GWO-SVR and the original SVR in the experiments. Also, to prove \nthe effectiveness of SRXGWO-SVR model, backpropagation (BP), Table 1 \nDescription of the 30 benchmark functions. \nClass No. Functions F∗\ni\nFi
x∗\nUnimodal Functions 1 Rotated High Conditioned Elliptic \nFunction 100 \n2 Rotated Bent Cigar Function 200 \n3 Rotated Discus Function 300 \nSimple Multimodal \nFunctions 4 Shifted and Rotated Rosenbrock ’s \nFunction 400 \n5 Shifted and Rotated Ackley ’s Function 500 \n6 Shifted and Rotated Weierstrass \nFunction 600 \n7 Shifted and Rotated Griewank ’s \nFunction 700 \n8 Shifted Rastrigin ’s Function 800 \n9 Shifted and Rotated Rastrigin ’s Function 900 \n10 Shifted Schwefel ’s Function 1000 \n11 Shifted and Rotated Schwefel ’s Function 1100 \n12 Shifted and Rotated Katsuura Function 1200 \n13 Shifted and Rotated HappyCat Function 1300 \n14 Shifted and Rotated HGBat Function 1400 \n15 Shifted and Rotated Expanded \nGriewank ’s plus Rosenbrock ’s Function 1500 \n16 Shifted and Rotated Expanded Scaffer ’s \nF6 Function 1600 \nHybrid Functions 17 Hybrid Function 1 (N 3) 1700 \n18 Hybrid Function 2 (N 3) 1800 \n19 Hybrid Function 3 (N 4) 1900 \n20 Hybrid Function 4 (N 4) 2000 \n21 Hybrid Function 5 (N 5) 2100 \n22 Hybrid Function 6 (N 5) 2200 \nComposition \nFunctions 23 Composition Function 1 (N 5) 2300 \n24 Composition Function 2 (N 3) 2400 \n25 Composition Function 3 (N 3) 2500 \n26 Composition Function 4 (N 5) 2600 \n27 Composition Function 5 (N 5) 2700 \n28 Composition Function 6 (N 5) 2800 \n29 Composition Function 7 (N 3) 2900 \n30 Composition Function 8 (N 3) 3000 \nTable 2 \nGWO variants based on three strategies. \nAlgorithms S R X \nSRXGWO 1 1 1 \nGWO 0 0 0 \nSGWO 1 0 0 \nRGWO 0 1 0 \nXGWO 0 0 1 \nSRGWO 1 1 0 \nSXGWO 1 0 1 \nRXGWO 0 1 0 X. Zhang et al.\n\n[Página 9]\nComputers in Biology and Medicine 163 (2023) 107166\n9random forest (RF), KELM, radial basis function network (RBF), con-\nvolutional neural networks (CNN), and other well-known predictive \nclassifiers are added to the comparison experiments. 
To verify the prediction effectiveness of the proposed patient-flow prediction models, three evaluation metrics are applied to evaluate the performance of the various prediction models in this paper. The three evaluation metrics are the Spearman correlation coefficient (R²) of Eq. (38), the mean absolute error (MAE) of Eq. (39), and the root mean square error (RMSE) of Eq. (40) for the evaluation analysis.

$$R^2 = 1 - \frac{\sum_{k=1}^{m}\left(y_k-\hat{y}_k\right)^2}{\sum_{k=1}^{m}\left(y_k-\bar{y}\right)^2} \tag{38}$$

$$\mathrm{MAE} = \frac{1}{m}\sum_{i=0}^{m-1}\left|y_i-\hat{y}_i\right| \tag{39}$$

$$\mathrm{RMSE} = \sqrt{\frac{1}{m}\sum_{k=1}^{m}\left(y_k-\hat{y}_k\right)^2} \tag{40}$$

where m is the number of samples, $y_k$ is defined as the actual value of the test sample, $\bar{y}$ is the mean value of the test samples, and $\hat{y}_k$ is the predicted value of the test sample.

4.2.3. Prediction results and analysis

To perform regression calculations on the decomposed subsequences using the SVR model, the patient-flow data set needs to meet the input format of the SVR model. For this purpose, the original data samples are processed as follows.

First, for the time series $y_1, y_2, \dots, y_n$, define the input matrix:

$$X=\begin{bmatrix} y_1 & \cdots & y_d \\ \vdots & \ddots & \vdots \\ y_{n-d} & \cdots & y_{n-1} \end{bmatrix} \tag{41}$$

where d is the step size parameter, i.e., the number of sample attributes, which in this paper is 4.

Then, define the output labels:

$$y=\begin{bmatrix} y_{d+1} \\ \vdots \\ y_n \end{bmatrix} \tag{42}$$

Finally, use X and y defined above as the input and label of the SVR model, respectively. In practice, X and y are divided into a training set and a test set in the ratio of 1:1. The training set is used to train the model and determine the optimal parameters of the model. Then, the trained model is simulated and tested on the test set to demonstrate the training effect of the prediction model. Finally, the accuracy performance of the model is verified by evaluating the metrics R², RMSE, and MAE. The following are the experimental results and training and test set analyses.

1. Prediction experiments on the training set

The patient-flow dataset is divided into 120 sample sets by a 1:1 split as the training set for training eight prediction models: SRXGWO-SVR, GWO-SVR, SVR, BP, RF, KELM, RBF, and CNN. Fig. 6 shows the prediction result plot of SRXGWO-SVR. The Original curve represents the training set's original data distribution and the Predicted curve represents the prediction results given by the SRXGWO-SVR model. The line graph shows that the overall prediction effect of the SRXGWO-SVR model is excellent, especially in the interval of 70–120 days.
The Original and Predicted lines nearly overlap, which indicates \nthat the prediction is very accurate. The large deviations between the Table 3 \nResults of Wilcoxon signed-rank test for ablation experiments and P-value. \nItem SRXGWO GWO SGWO RGWO XGWO SRGWO SXGWO RXGWO \n/\u0000/ ~ 23/1/6 15/1/14 14/3/13 18/0/12 6/5/19 9/0/21 9/2/19 \nMean 2.57 6.90 5.40 4.47 4.67 2.93 4.13 3.53 \nRank 1 8 7 5 6 2 4 3 \nF1 N/A 1.9209E-06 1.0246E-05 4.0483E-01 4.7162E-02 2.8948E-01 9.7772E-02 1.6503E-01 \nF2 N/A 1.9209E-06 1.9209E-06 8.3071E-04 1.6394E-05 2.4118E-04 3.7243E-05 3.3269E-02 \nF3 N/A 1.7344E-06 1.7344E-06 6.0350E-03 8.9364E-01 6.8359E-03 6.2683E-02 3.1849E-01 \nF4 N/A 2.3704E-05 3.8822E-06 6.2884E-01 3.6094E-03 4.4052E-01 7.8647E-02 5.9994E-01 \nF5 N/A 1.7344E-06 2.6033E-06 6.8923E-05 1.7344E-06 4.1955E-04 1.7344E-06 8.1302E-01 \nF6 N/A 4.7162E-02 3.1618E-03 7.0356E-01 4.4052E-01 9.0993E-01 9.0993E-01 9.0993E-01 \nF7 N/A 1.7344E-06 1.7344E-06 1.1499E-04 1.2453E-02 4.0715E-05 3.1618E-03 6.5833E-01 \nF8 N/A 1.7344E-06 1.9209E-06 9.3676E-02 1.7344E-06 1.9861E-01 2.3534E-06 7.1889E-01 \nF9 N/A 3.6004E-01 2.9894E-01 2.4308E-02 8.6121E-01 5.5774E-01 2.2888E-01 7.0356E-01 \nF10 N/A 1.7344E-06 1.7344E-06 4.7162E-02 2.1266E-06 4.7162E-02 1.9209E-06 6.2884E-01 \nF11 N/A 7.3433E-01 3.0861E-01 5.0383E-01 4.1653E-01 1.3591E-01 9.2626E-01 5.5774E-01 \nF12 N/A 8.2206E-02 5.4401E-01 5.9836E-02 3.3173E-04 7.7309E-03 1.1079E-02 3.6004E-01 \nF13 N/A 2.2102E-01 3.9333E-01 1.8462E-01 5.5774E-01 2.9894E-01 3.1849E-01 4.1653E-01 \nF14 N/A 1.3975E-02 1.8326E-03 2.6230E-01 8.5896E-02 1.2544E-01 1.7791E-01 2.3694E-01 \nF15 N/A 1.4773E-04 6.3391E-06 3.6826E-02 4.9080E-01 2.7653E-03 1.8462E-01 1.0201E-01 \nF16 N/A 5.3197E-03 2.9575E-03 1.1138E-03 7.5213E-02 2.5637E-02 1.7138E-01 6.5641E-02 \nF17 N/A 9.8421E-03 3.0861E-01 3.1849E-01 8.7297E-03 3.8723E-02 7.1889E-01 6.5833E-01 \nF18 N/A 6.8359E-03 9.3157E-06 8.5896E-02 8.9187E-05 6.5641E-02 1.4936E-05 1.4773E-04 \nF19 N/A 1.4839E-03 8.9443E-04 
1.9861E-01 6.4352E-01 1.3591E-01 2.0589E-01 2.1827E-02 \nF20 N/A 1.9209E-06 1.7344E-06 5.3070E-05 5.3044E-01 5.2165E-06 1.5886E-01 7.3433E-01 \nF21 N/A 9.0993E-01 4.7795E-01 7.5213E-02 1.0639E-01 2.1827E-02 8.2901E-01 5.0383E-01 \nF22 N/A 1.6503E-01 6.5641E-02 7.1903E-02 1.6503E-01 3.8203E-01 1.8519E-02 2.4519E-01 \nF23 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF24 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF25 N/A 1.2290E-05 1.0000E00 1.7344E-06 5.6061E-06 1.0000E00 1.0000E00 1.7344E-06 \nF26 N/A 1.9729E-05 1.6566E-02 1.0357E-03 1.3820E-03 1.5286E-01 3.1603E-02 3.1618E-03 \nF27 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF28 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF29 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF30 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 X. Zhang et al.\n\n[Página 10]\nComputers in Biology and Medicine 163 (2023) 107166\n10Table 4 \nComparison results of SRXGWO with other algorithms. 
\nFun F1 F2 F3 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.5817E07 8.9158E06 1.7773E08 1.4681E08 4.5005E03 3.3271E03 \nPSO 9.0808E06 1.6903E06 1.4837E08 1.5123E07 9.9378E02 1.2790E�02 \nSCA 2.2839E08 6.9799E07 1.6889E10 2.3915E09 3.7046E04 6.6934E03 \nMFO 8.7549E07 1.0414E08 1.0114E10 5.9855E09 1.0275E05 5.8223E04 \nWOA 2.7540E07 1.1331E07 5.0637E06 8.0209E06 3.2575E04 2.0632E04 \nBA 7.7059E�05 3.5272E�05 5.2698E�05 2.7431E�05 4.2251E�02 1.6464E02 \nFA 2.5269E08 5.1675E07 1.5002E10 1.8122E09 6.4325E04 1.0623E04 \nOBSCA 4.0160E08 1.2958E08 2.4801E10 4.7138E09 5.0550E04 9.2351E03 \nm_SCA 6.3874E07 4.1104E07 6.3318E09 3.7149E09 2.6908E04 6.6947E03 \nOBLGWO 2.2042E07 1.2605E07 1.6887E07 1.2778E07 9.1358E03 3.3451E03 \nACWOA 1.3860E08 6.2461E07 7.4290E09 3.9581E09 5.0191E04 9.0562E03 \nMOFOA 1.2354E09 7.4867E07 7.7038E10 2.4594E09 7.8687E04 3.7238E03 \nSCADE 4.5429E08 1.1842E08 3.0003E10 4.0210E09 5.6160E04 7.2834E03 \nFun F4 F5 F6 \nItem AVG STD AVG STD AVG STD \nSRXGWO 5.4006E02 3.2112E01 5.2075E02 7.2959E-02 6.1118E�02 2.5044E00 \nPSO 4.6707E02 3.2003E�01 5.2095E02 4.0216E-02 6.2317E02 3.2594E00 \nSCA 1.4150E03 2.7588E02 5.2093E02 6.2064E-02 6.3356E02 2.3449E00 \nMFO 1.5209E03 1.0125E03 5.2030E�02 1.6938E-01 6.2361E02 3.5309E00 \nWOA 5.9251E02 6.0017E01 5.2034E02 1.6112E-01 6.3494E02 3.5778E00 \nBA 4.2155E�02 3.2061E01 5.2095E02 6.4791E-02 6.3398E02 3.6948E00 \nFA 1.5337E03 1.5192E02 5.2096E02 4.5044E-02 6.3359E02 9.2350E-01 \nOBSCA 2.3121E03 7.5405E02 5.2095E02 5.7443E-02 6.3205E02 1.4049E00 \nm_SCA 8.0286E02 1.1489E02 5.2056E02 1.4351E-01 6.2212E02 2.8889E00 \nOBLGWO 5.4647E02 4.7860E01 5.2096E02 5.9910E-02 6.1916E02 3.3318E00 \nACWOA 1.1803E03 2.6266E02 5.2085E02 1.7768E-01 6.3363E02 2.7978E00 \nMOFOA 1.0092E04 6.9816E02 5.2106E02 3.7558E-02 6.4079E02 6.7902E-01 \nSCADE 2.2480E03 4.6553E02 5.2097E02 4.3335E-02 6.3428E02 2.4021E00 \nFun F7 F8 F9 \nItem AVG STD AVG STD AVG STD \nSRXGWO 7.0144E02 4.4844E-01 8.3494E�02 6.4659E�00 9.9741E�02 2.4837E01 \nPSO 7.0229E02 
1.4348E-01 9.7268E02 2.6092E01 1.1067E03 2.4938E01 \nSCA 8.4528E02 2.6369E01 1.0362E03 1.9353E01 1.1756E03 2.4065E01 \nMFO 7.9627E02 6.3419E01 9.4824E02 3.3320E01 1.1205E03 4.4316E01 \nWOA 7.0099E02 7.2969E-02 9.9955E02 4.1935E01 1.1246E03 5.0520E01 \nBA 7.0066E�02 1.6102E-01 1.0275E03 5.2626E01 1.1641E03 5.6092E01 \nFA 8.4000E02 1.0997E01 1.0240E03 1.2118E01 1.1595E03 1.3038E01 \nOBSCA 9.1758E02 4.4244E01 1.0576E03 1.8074E01 1.1960E03 1.9095E01 \nm_SCA 7.4867E02 2.2125E01 9.3470E02 2.3339E01 1.0491E03 1.9402E01 \nOBLGWO 7.0119E02 9.2779E-02 9.2058E02 3.4783E01 1.0637E03 2.9684E01 \nACWOA 7.3883E02 2.1566E01 9.8681E02 1.5413E01 1.1270E03 1.7226E01 \nMOFOA 1.4082E03 4.6569E01 1.1760E03 1.1881E01 1.2583E03 9.4200E�00 \nSCADE 9.1691E02 4.4469E01 1.0684E03 1.0564E01 1.2058E03 1.8217E01 \nFun F10 F11 F12 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.7815E�03 2.3016E�02 4.1565E�03 1.0677E03 1.2012E03 3.7913E-01 \nPSO 5.0248E03 5.6761E02 5.8289E03 4.4923E02 1.2023E03 3.0765E-01 \nSCA 7.0064E03 5.2529E02 8.0775E03 3.0696E02 1.2025E03 2.1633E-01 \nMFO 4.6021E03 8.7516E02 5.2295E03 7.7681E02 1.2004E�03 1.9653E-01 \nWOA 4.9691E03 7.4150E02 5.8744E03 9.0861E02 1.2017E03 4.7579E-01 \nBA 5.5034E03 5.6881E02 6.0313E03 6.9746E02 1.2011E03 3.5842E-01 \nFA 7.5532E03 3.1957E02 7.9058E03 2.9315E02 1.2026E03 2.3995E-01 \nOBSCA 6.3076E03 4.9831E02 7.3709E03 3.6056E02 1.2022E03 4.1510E-01 \nm_SCA 4.0584E03 7.1133E02 4.7823E03 6.5478E02 1.2008E03 3.3864E-01 \nOBLGWO 3.8703E03 8.9566E02 5.4446E03 1.0838E03 1.2023E03 5.7151E-01 \nACWOA 4.7309E03 7.3276E02 6.1655E03 9.3475E02 1.2018E03 4.7511E-01 \nMOFOA 9.2300E03 3.9968E02 9.0883E03 2.8283E�02 1.2029E03 2.7367E-01 \nSCADE 7.3914E03 2.4356E02 8.2418E03 2.8346E02 1.2026E03 2.4238E-01 \nFun F13 F14 F15 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.3004E�03 7.4709E-02 1.4005E03 2.8662E-01 1.5163E03 6.0201E00 \nPSO 1.3004E03 7.7571E-02 1.4003E03 1.2817E-01 1.5166E03 1.1804E�00 \nSCA 1.3030E03 2.6429E-01 1.4439E03 7.6871E00 5.5707E03 5.0710E03 \nMFO 
1.3020E03 1.3201E00 1.4347E03 2.4514E01 2.1529E05 5.9281E05 \nWOA 1.3006E03 1.4348E-01 1.4003E�03 4.2398E-02 1.5738E03 2.6213E01 \nBA 1.3005E03 1.5518E-01 1.4003E03 1.3344E-01 1.5296E03 6.4355E00 \nFA 1.3028E03 1.9987E-01 1.4404E03 4.2258E00 1.4383E04 5.6495E03 \n(continued on next page) X. Zhang et al.\n\n[Página 11]\nComputers in Biology and Medicine 163 (2023) 107166\n11Table 4 (continued ) \nFun F1 F2 F3 \nOBSCA 1.3037E03 3.6249E-01 1.4731E03 1.1450E01 1.7595E04 1.0828E04 \nm_SCA 1.3009E03 7.5448E-01 1.4172E03 7.0904E00 2.1370E03 8.9061E02 \nOBLGWO 1.3005E03 1.1306E-01 1.4004E03 1.7893E-01 1.5162E�03 4.9642E00 \nACWOA 1.3015E03 1.0565E00 1.4197E03 1.4944E01 2.0795E03 6.3700E02 \nMOFOA 1.3081E03 3.0417E-01 1.6411E03 9.7254E00 2.2096E05 3.2757E04 \nSCADE 1.3040E03 3.7540E-01 1.4874E03 8.7317E00 1.9117E04 6.0793E03 \nFun F16 F17 F18 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.6110E�03 4.7850E-01 6.0312E05 7.0192E05 1.0517E04 9.4630E03 \nPSO 1.6120E03 5.3328E-01 2.9096E05 1.3413E05 1.9795E06 5.9660E05 \nSCA 1.6127E03 2.3363E-01 6.2791E06 3.3409E06 1.4952E08 6.6811E07 \nMFO 1.6128E03 4.8942E-01 3.9449E06 7.4952E06 1.2984E08 4.9905E08 \nWOA 1.6124E03 4.0816E-01 4.2933E06 3.4224E06 7.9323E�03 5.7540E�03 \nBA 1.6133E03 3.0344E-01 1.0170E�05 9.2079E�04 9.5662E04 4.7410E04 \nFA 1.6129E03 2.1659E-01 6.7984E06 1.7537E06 3.0346E08 8.6628E07 \nOBSCA 1.6130E03 2.5196E-01 9.4571E06 3.4434E06 1.8689E08 1.1951E08 \nm_SCA 1.6114E03 7.5109E-01 1.7735E06 1.3758E06 2.3094E07 3.3825E07 \nOBLGWO 1.6120E03 4.3556E-01 1.2085E06 9.3079E05 3.4535E04 3.1262E04 \nACWOA 1.6122E03 4.8843E-01 1.5272E07 1.2808E07 5.6959E07 4.4496E07 \nMOFOA 1.6134E03 2.3187E-01 8.7256E07 2.6488E07 5.7808E09 1.0374E09 \nSCADE 1.6127E03 2.0380E-01 1.5384E07 5.7531E06 1.6460E08 8.4537E07 \nFun F19 F20 F21 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.9173E03 1.3658E01 2.9079E03 1.1572E03 3.7294E05 3.3804E05 \nPSO 1.9172E03 1.9835E�00 2.2959E�03 6.5024E�01 1.1324E05 6.7643E04 \nSCA 1.9893E03 2.5079E01 1.5103E04 3.7270E03 
1.4348E06 6.6889E05 \nMFO 1.9738E03 5.5538E01 5.2933E04 4.0442E04 1.0824E06 2.5030E06 \nWOA 1.9384E03 2.7102E01 3.2328E04 2.0050E04 1.1294E06 1.7119E06 \nBA 1.9335E03 3.4019E01 2.4023E03 1.1992E02 6.4514E�04 3.2131E�04 \nFA 2.0050E03 1.2211E01 1.8924E04 7.2016E03 1.6271E06 7.2788E05 \nOBSCA 2.0080E03 9.9922E00 2.9021E04 1.1924E04 1.8445E06 8.5563E05 \nm_SCA 1.9502E03 2.9621E01 1.0791E04 4.6890E03 3.7774E05 5.2345E05 \nOBLGWO 1.9170E�03 1.6997E01 5.6962E03 2.3328E03 5.2217E05 3.6592E05 \nACWOA 2.0080E03 2.5161E01 3.9788E04 1.9571E04 6.9036E06 5.4050E06 \nMOFOA 2.2412E03 1.8281E01 1.4788E05 5.3365E04 3.9619E07 1.4979E07 \nSCADE 2.0087E03 1.1766E01 2.8049E04 9.6164E03 2.3498E06 1.0171E06 \nFun F22 F23 F24 \nItem AVG STD AVG STD AVG STD \nSRXGWO 2.6550E03 1.8361E02 2.5000E�03 0.0000E�00 2.6000E�03 0.0000E�00 \nPSO 2.9439E03 1.8435E02 2.6161E03 5.8346E-01 2.6261E03 5.6750E00 \nSCA 2.9644E03 1.3112E02 2.6653E03 1.3746E01 2.6001E03 6.9049E-02 \nMFO 3.0695E03 2.1885E02 2.6708E03 3.4130E01 2.6722E03 2.7522E01 \nWOA 3.0538E03 2.9728E02 2.6334E03 1.0652E01 2.6118E03 3.7279E01 \nBA 3.3420E03 4.1760E02 2.6152E03 3.0962E-03 2.6654E03 2.6008E01 \nFA 3.0002E03 1.1217E�02 2.7329E03 1.7512E01 2.7050E03 4.5757E00 \nOBSCA 3.1226E03 1.6474E02 2.6858E03 1.7839E01 2.6000E03 3.0468E-04 \nm_SCA 2.6046E�03 2.1219E02 2.6370E03 6.7666E00 2.6000E03 6.8563E-04 \nOBLGWO 2.7106E03 1.7350E02 2.6181E03 1.4048E00 2.6009E03 5.0249E00 \nACWOA 3.1046E03 2.2793E02 2.5122E03 4.6578E01 2.6000E03 5.0998E-06 \nMOFOA 1.8112E04 1.1960E04 2.5000E03 0.0000E00 2.6000E03 0.0000E00 \nSCADE 3.1435E03 1.3870E02 2.5000E03 0.0000E00 2.6000E03 1.9769E-07 \nFun F25 F26 F27 \nItem AVG STD AVG STD AVG STD \nSRXGWO 2.7000E�03 0.0000E�00 2.7004E�03 8.2706E-02 2.9000E�03 0.0000E�00 \nPSO 2.7118E03 7.4419E00 2.7871E03 3.4604E01 3.4367E03 2.8726E02 \nSCA 2.7269E03 8.2372E00 2.7023E03 6.7894E-01 3.4443E03 3.2075E02 \nMFO 2.7194E03 1.1345E01 2.7024E03 1.2575E00 3.6640E03 1.4731E02 \nWOA 2.7153E03 1.6594E01 2.7005E03 1.3903E-01 
3.8579E03 2.9527E02 \nBA 2.7314E03 1.2072E01 2.7005E03 1.5158E-01 3.8975E03 3.7586E02 \nFA 2.7336E03 3.7833E00 2.7024E03 3.2727E-01 3.7997E03 2.1775E01 \nOBSCA 2.7000E03 1.4243E-08 2.7040E03 4.1439E-01 3.2568E03 4.0280E01 \nm_SCA 2.7124E03 4.1923E00 2.7008E03 2.1587E-01 3.1851E03 1.2821E02 \nOBLGWO 2.7000E03 0.0000E00 2.7006E03 1.2740E-01 3.1171E03 3.1805E02 \nACWOA 2.7000E03 0.0000E00 2.7636E03 4.8645E01 3.7129E03 3.5075E02 \nMOFOA 2.7000E03 0.0000E00 2.7925E03 2.3425E01 2.9000E03 0.0000E00 \nSCADE 2.7000E03 0.0000E00 2.7070E03 1.7566E01 3.2989E03 1.9042E02 \nFun F28 F29 F30 \nItem AVG STD AVG STD AVG STD \n(continued on next page) X. Zhang et al.\n\n[Página 12]\nComputers in Biology and Medicine 163 (2023) 107166\n12real and predicted series appear on the 3rd day, around the 32nd day, \netc., due to the large fluctuations of the real series, which are difficult to \npredict and lead to the deviation of the model. \nTo illustrate the improvement of SRXGWO-SVR compared to GWO- \nSVR, the iteration curves when SRXGWO and GWO optimized SVR are \nrecorded in this paper, as shown in Fig. 7. The vertical axis represents the fitness value of the swarm intelligence algorithm, i.e., the deviation \nin the model, and the horizontal axis represents the number of iterations. \nThe blue curve represents the iteration curve of SRXGWO-SVR, and the \nbrown curve represents the iteration curve of GWO-SVR. The iterations \nalso confirm that the two hyperparameters of the SRXGWO-SVR pre-\ndiction model are C 76.2569 and R 0.0101. The hyperparameters of \nthe GWO-SVR are C 2.3654 and R 0.0309. Since the overall de-\nviations of both SRXGWO-SVR and GWO-SVR are small, and the process \nof iteration spans an extensive numerical range, we have enlarged the \nkey parts were enlarged. First, in terms of initialization, SRXGWO-SVR \nhas a smaller fitness value than GWO-SVR, which indicates that the \nSobol sequence initialization method enhances the pre-search capability \nof SRXGWO. 
Then, it can be seen by the magnified image that both \nSRXGWO and GWO find the near-optimal solution at the iteration \nnumber of 2, but it is evident that SRXGWO has a better fitness value for \nthe near-optimal solution. Finally, during the iterations, SRXGWO also \nkeeps searching for the optimal solution, and the fitness value of \nSRXGWO is optimized from 0.0003285 at the beginning to 0.0003271. \nThe fitness value of GWO does not change significantly, and the algo-\nrithm falls into a local optimum. Therefore, it can be said that SRXGWO \ncan improve SVR’s prediction performance more effectively than GWO. \nThis work compares SRXGWO-SVR with well-known classification \nprediction models including GWO-SVR, SVR, BP, RF, KELM, RBF, and Table 4 (continued ) \nFun F1 F2 F3 \nSRXGWO 3.0000E�03 0.0000E�00 3.1000E�03 0.0000E�00 3.2000E�03 0.0000E�00 \nPSO 6.8849E03 8.7157E02 7.4382E04 1.3763E05 1.1678E04 6.2526E03 \nSCA 4.7736E03 2.6752E02 1.2836E07 7.6163E06 2.3980E05 7.9328E04 \nMFO 3.9703E03 2.4525E02 3.6610E06 3.9023E06 6.3694E04 5.2942E04 \nWOA 5.0223E03 6.7902E02 6.3246E06 4.5803E06 7.5080E04 4.8586E04 \nBA 5.1296E03 5.6070E02 3.6448E07 2.6098E07 1.3731E04 1.2024E04 \nFA 4.2282E03 1.4435E02 3.1490E06 8.4923E05 1.7420E05 3.9597E04 \nOBSCA 5.3567E03 2.9466E02 2.0712E07 9.7835E06 3.7443E05 1.9299E05 \nm_SCA 3.8890E03 1.2875E02 1.9729E06 4.4218E06 5.5540E04 2.8810E04 \nOBLGWO 3.4266E03 5.0458E02 4.9452E06 4.3781E06 1.9074E04 1.4566E04 \nACWOA 4.3232E03 1.2224E03 1.8950E07 1.5200E07 3.7383E05 2.2958E05 \nMOFOA 3.0000E03 0.0000E00 3.1000E03 0.0000E00 3.2000E03 0.0000E00 \nSCADE 4.9933E03 8.5262E02 1.5512E07 9.5368E06 4.8922E05 1.6393E05 \nTable 5 \nWilcoxon signed-rank test results of SRXGWO versus other peers. 
\nAlgorithm /\u0000/ Mean Rank \nSRXGWO ~ 2.13 1 \nPSO 19/8/3 4.80 4 \nSCA 30/0/0 8.57 9 \nMFO 26/2/2 7.33 7 \nWOA 25/4/1 6.13 6 \nBA 20/7/3 5.93 5 \nFA 30/0/0 9.47 10 \nOBSCA 29/0/1 9.70 11 \nm_SCA 26/2/2 4.73 3 \nOBLGWO 20/2/8 4.00 2 \nACWOA 28/0/2 7.57 8 \nMOFOA 23/0/7 10.17 13 \nSCADE 27/0/3 9.87 12 \nFig. 3.Friedman test results of SRXGWO versus other peers. X. Zhang et al.\n\n[Página 13]\nComputers in Biology and Medicine 163 (2023) 107166\n13\nFig. 4.Convergence curves of SRXGWO and peer algorithms. \nFig. 5.240-day folding graph of the number of actual hospital visits. X. Zhang et al.\n\n[Página 14]\nComputers in Biology and Medicine 163 (2023) 107166\n14CNN to further highlight the benefits of SRXGWO-SVR. It uses R2, RMSE, \nand MAE to assess the accuracy of the predictions. In order to guarantee \nthe stability of the prediction results and prevent chance mistakes, the \n10-fold cross-validation is also utilised in the model training process. \nTable 6 displays the evaluation findings for each model, and it is clear \nthat SRXGWO-SVR performs the best in terms of R2, RMSE, and MAE \nassessment indices. The correlation coefficient, R2, is 0.99879, which \nshows that there is a strong connection between the prediction results of \nthe SRXGWO-SVR model and the actual value. It is clear that SRXGWO- \nSVR performs best in R2, RMSE, and MAE evaluation indices. RMSE and \nMAE are used to evaluate errors. The two forms of SVR errors are the \nleast, with corresponding values of 159.5753 and 100.0009. Following \nline graph analysis, iterative graph analysis, and evaluation result \nanalysis, it can be shown that the SRXGWO-SVR model has a very high \nprediction accuracy and also has more advantages than other \nFig. 6.Prediction results of SRXGWO-SVR. \nFig. 7.Iteration curves of SRXGWO and GWO when optimizing SVR. \nTable 6 \nEvaluation results of each prediction model. 
\nModel R2 RMSE MAE \nSRXGWO-SVR 0.99879 159.5753 100.0009 \nGWO-SVR 0.99869 159.5886 100.0069 \nSVR 0.99861 166.1568 105.0999 \nBP 0.99820 584.2596 119.5581 \nRF 0.98379 176.6171 335.1838 \nKELM 0.99819 195.6333 144.1484 \nRBF 0.99865 168.8734 110.3226 \nCNN 0.99744 228.9898 110.3226 X. Zhang et al.\n\n[Página 15]\nComputers in Biology and Medicine 163 (2023) 107166\n15algorithms. \n2. Prediction experiments on the test set \nThe model trained by the real sequence must be closer to the training \nset itself, and there may be problems of false accuracy of the prediction \nresults and overfitting of the prediction model. Moreover, the prediction \nproblem, in reality, will not be the same as the real sequence of the \ntraining set, so it is necessary to simulate and test the completed trained \nmodel by the test set. \nFig. 8 shows the prediction fold of SRXGWO-SVR for the test set. \nAgain, the Original fold represents the data distribution of the test set, \nand the Predicted fold represents the prediction results given by the \nSRXGWO-SVR model. It can be seen that SRXGWO-SVR also predicts \nvery well in the test set prediction with high correlation. However, the \ndeviation of SRXGWO-SVR in predicting the test set is more significant \nthan the training set, e.g., the deviation of the dashboard on days \n7,10,13,36 is larger. Therefore, overall, SRXGWO-SVR still has a highly \naccurate prediction performance and does not fall into the overfitting \nproblem when faced with brand-new patient-flow data. However, it \ncannot achieve the results in training. \nTo further explore the performance of SRXGWO-SVR in the face of \nnew sample sequences and to show the advantages of SRXGWO-SVR \nover other algorithms, the test set experiments also compare \nSRXGWO-SVR with well-known classification prediction models such as \nGWO-SVR, SVR, and BP, and evaluate the prediction results using R2, \nRMSE, and MAE. The evaluation results of each model are shown in \nTable 7. 
It can be seen that SRXGWO-SVR has higher Spearman corre -\nlation and lower error in RMSE, MAE for prediction results compared \nwith GWO-SVR, SVR, which indicates that SRXGWO-SVR still has an \nadvantage over the unimproved GWO-SVR and SVR in the face of new \ndata sets. In addition, it can be seen that SRXGWO-SVR still has a greater \nadvantage over BP, RF, KELM, RBF, and CNN classical models, and \nperforms better in terms of R2, RMSE, and MAE. \nFinally, this paper combines the prediction results of the training set \nand the test set for statistical comparisons in order to further highlight \nthe significance of the training set experiments and the test set experi -\nments, as well as to demonstrate the prediction effectiveness of \nSRXGWO-SVR for various data sets and the advantages of SRXGWO-SVR \nover other algorithms. The comparison findings are shown in Figs. 9–11, \nwhere the horizontal axis represents each comparison model and the \nvertical axis the assessment standards. Fig. 9 shows that when SRXGWO- SVR is moved from the training set to the test set, the prediction rele-\nvance of the model diminishes and that KELM fluctuates the least. \nHowever, SRXGWO-SVR still outperforms KELM in terms of accuracy, \nsuggesting that it may continue to hold the top spot in future patient- \nflow prediction. The assessment findings were normalized in this \nresearch and then shown once more since RMSE and MAE are prediction \nerrors and the difference between the data is too great. Figs. 10 and 11 \nshow intuitively how much more accurate SRXGWO-SVR is than other \nmodels like BP, RF, CNN, and others. Additionally, even after switching \ndatasets, there is little error variation in the SRXGWO-SVR prediction \nresults, demonstrating the model ’s great stability. It can be shown that \nSRXGWO-SVR is a very accurate, highly generalizable, and highly stable \nprediction model based on the experimental findings of the training and \ntest sets. 
\n5.Conclusions and future works \nThis paper proposes a high-performance optimization algorithm \nSRXGWO and an effective patient-flow prediction model SRXGWO-SVR, \naiming to predict patients ’ medical needs and achieve orderly patient \naccess by analyzing the changing dynamics and objective laws of \nPatient-flow. First, this paper introduces the current research status of \nartificial intelligence technology for predicting patient-flow and finds \nthat the existing prediction models are not strong in prediction accuracy \nand generalization. Therefore, to improve the accuracy and general -\nization of the prediction model, SRXGWO is proposed based on three \nimprovement strategies and GWO, in which the Sobol sequence im-\nproves the solution space coverage of population initialization, Cauchy \nrandom replacement strategy enhances the information exchange be-\ntween individuals, directional mutation mechanism improves the search \nFig. 8.SRXGWO-SVR predictions for the test set. Table 7 \nEvaluation results of each model based on the test set. \nModel R2 RMSE MAE \nSRXGWO-SVR 0.99835 199.0553 125.6847 \nGWO-SVR 0.99802 199.0954 125.7070 \nSVM 0.99783 218.1971 136.1934 \nBP 0.99738 232.2147 150.2261 \nRF 0.97952 701.2146 427.7865 \nKELM 0.99819 291.1310 185.8860 \nRBF 0.99831 201.5883 129.3960 \nCNN 0.98132 628.8679 363.9654 X. Zhang et al.\n\n[Página 16]\nComputers in Biology and Medicine 163 (2023) 107166\n16ability of the algorithm and the ability to jump out of the local optimum. \nThen, the SRXGWO-SVR prediction model is proposed by combining the \nhigh-performance SRXGWO algorithm with the SVR prediction method \nto accurately predict the number of patients and reasonably schedule \nmedical resources. In the experimental part, ablation experiments are \nfirst conducted to compare SRXGWO with GWO combined with different \nmechanisms. It is verified that SRXGWO, with three improved strategies, \nsimultaneously is the strongest performance. 
Then, SRXGWO is \ncompared with 12 highly cited algorithms, such as PSO, SCA, etc., by 30 \nbenchmark functions to demonstrate that SRXGWO is also superior in \nthe search ability and exploitation ability of peer algorithms. Finally, a \nreal patient-flow dataset is used to validate the prediction ability of the SRXGWO-SVR model. Comparing with the other seven prediction \nmodels, such as BP, CNN, etc., and evaluating R2, RMSE, and MAE, it is \nproved that the prediction results of SRXGWO-SVR are more accurate, \neffective and stronger than other models. \nOf course, the research in this paper also has some limitations. For \nexample, three improvement mechanisms were added to GWO, which \nincreased the algorithm ’s complexity. In the future, we will try to solve \nthis problem using parallel techniques and high-performance com-\nputers. In addition, in future work, we will further enhance SRXGWO \nand SRXGWO-SVR and apply them to more fields. \nFig. 9.R2 comparison results based on two dataset models. \nFig. 10.Comparison results of RMSE based on two dataset models. \nFig. 11.Comparison results of MAE based on two dataset models. X. Zhang et al.\n\n[Página 17]\nComputers in Biology and Medicine 163 (2023) 107166\n17Declaration of competing interest \nThe authors declare that there is no conflict of interests regarding the \npublication of article. \nReferences \n[1]L. Zhang, L. Li, Study on the Equilibrium of Spatial Allocation of Medical Resources \nat Different Levels in Shanghai, Urban Studies, 2019, p. 26. \n[2]D.Y. Zhou, L.Y. Gao, Q.H. Pan, M.F. He, The Impacts of Medical Resources on \nEmerging Self-Limiting Infectious Diseases, vol. 12, Applied Sciences-Basel, 2022 . \n[3]H. Li, D.M. Mu, P. Wang, Y. Li, D.X. Wang, Prediction of obstetric patient flow and \nhorizontal allocation of medical resources based on time series analysis, Front. \nPublic Health 9 (2021) . \n[4]A. Nikakhtar, S.A. Abbasian-Hosseini, H. Gazula, S.M. 
Hsiang, Social Network \nbased sensitivity analysis for patient flow using computer simulation, Comput. Ind. \nEng. 88 (2015) 264–272. \n[5]A.R. Sharafat, M. Bayati, PatientFlowNet: a deep learning approach to patient flow \nprediction in emergency departments, IEEE Access 9 (2021) 45552 –45561 . \n[6]M. Tavakoli, R. Tavakkoli-Moghaddam, R. Mesbahi, M. Ghanavati-Nejad, \nA. Tajally, Simulation of the COVID-19 patient flow and investigation of the future \npatient arrival using a time-series prediction model: a real-case study, Med. Biol. \nEng. Comput. 60 (2022) 969–990. \n[7]S. Mirjalili, S.M. Mirjalili, A. Lewis, Grey wolf optimizer, Adv. Eng. Software 69 \n(2014) 46–61. \n[8]X.-S. Yang, A new metaheuristic bat-inspired algorithm, in: J.R. Gonz ˘alez, D. \nA. Pelta, C. Cruz, G. Terrazas, N. Krasnogor (Eds.), Nature Inspired Cooperative \nStrategies for Optimization (NICSO 2010), Springer Berlin Heidelberg, Berlin, \nHeidelberg, 2010, pp. 65–74. \n[9]R. Storn, K.J.J.o.G.O. Price, Differential evolution – a simple and efficient heuristic \nfor global, Optimization over Continuous Spaces 11 (1997) 341–359. \n[10] S. Mirjalili, SCA, A Sine Cosine Algorithm for solving optimization problems, \nKnowl. Base Syst. 96 (2016) 120–133. \n[11] S. Mirjalili, A.H. Gandomi, S.Z. Mirjalili, S. Saremi, H. Faris, S.M. Mirjalili, Salp \nSwarm Algorithm: a bio-inspired optimizer for engineering design problems, Adv. \nEng. Software 114 (2017) 163–191. \n[12] S. Mirjalili, A. Lewis, The whale optimization algorithm, Adv. Eng. Software 95 \n(2016) 51–67. \n[13] S. Mirjalili, Moth-flame optimization algorithm: a novel nature-inspired heuristic \nparadigm, Knowl. Base Syst. 89 (2015) 228–249. \n[14] J. Kennedy, R. Eberhart, Particle swarm optimization, in: Proceedings of ICNN ’95 \nvol. 1944, International Conference on Neural Networks, 1995, pp. 1942 –1948 . \n[15] Y. Yang, H. Chen, A.A. Heidari, A.H. 
Gandomi, Hunger games search: visions, \nconception, implementation, deep analysis, perspectives, and towards performance \nshifts, Expert Syst. Appl. 177 (2021), 114864 . \n[16] A.A. Heidari, S. Mirjalili, H. Faris, I. Aljarah, M. Mafarja, H. Chen, Harris hawks \noptimization: algorithm and applications, Future Generation Computer Systems- \nthe International Journal of Escience 97 (2019) 849–872. \n[17] H. Su, D. Zhao, A. Asghar Heidari, L. Liu, X. Zhang, M. Mafarja, H. Chen, RIME: A \nPhysics-Based Optimization, Neurocomputing, 2023 . \n[18] J. Tu, H. Chen, M. Wang, A.H. Gandomi, The colony predation algorithm, Journal \nof Bionic Engineering 18 (2021) 674–710. \n[19] I. Ahmadianfar, A. Asghar Heidari, A.H. Gandomi, X. Chu, H. Chen, RUN beyond \nthe metaphor: an efficient optimization algorithm based on Runge Kutta method, \nExpert Syst. Appl. (2021), 115079 . \n[20] I. Ahmadianfar, A. Asghar Heidari, S. Noshadian, H. Chen, A.H. Gandomi, INFO: an \nefficient optimization algorithm based on weighted mean of vectors, Expert Syst. \nAppl. (2022), 116516 . \n[21] H. Chen, C. Li, M. Mafarja, A.A. Heidari, Y. Chen, Z. Cai, Slime mould algorithm: a \ncomprehensive review of recent variants and applications, Int. J. Syst. Sci. (2022) \n1–32. \n[22] S. Li, H. Chen, M. Wang, A.A. Heidari, S. Mirjalili, Slime mould algorithm: a new \nmethod for stochastic optimization, Future Generat. Comput. Syst. 111 (2020) \n300–323. \n[23] M. Abd Elaziz, D. Oliva, S. Xiong, An improved opposition-based sine cosine \nalgorithm for global optimization, Expert Syst. Appl. 90 (2017) 484–500. \n[24] C. Qu, Z. Zeng, J. Dai, Z. Yi, W. He, A modified sine-cosine algorithm based on \nneighborhood search and greedy Levy mutation, Comput. Intell. Neurosci. (2018), \n2018) 4231647-4231647 . \n[25] A.A. Heidari, R. Ali Abbaspour, H. Chen, Efficient boosted grey wolf optimizers for \nglobal search and kernel extreme learning machine training, Appl. Soft Comput. 81 \n(2019), 105521 . \n[26] M.A. 
Elhosseini, A.Y. Haikal, M. Badawy, N. Khashan, Biped robot stability based \non an A–C parametric Whale Optimization Algorithm, Journal of Computational \nScience 31 (2019) 17–32. \n[27] H. Chen, S. Li, A.A. Heidari, P. Wang, J. Li, Y. Yang, M. Wang, C. Huang, Efficient \nmulti-population outpost fruit fly-driven optimizers: framework and advances in \nsupport vector machines, Expert Syst. Appl. (2020) 142. \n[28] H. Nenavath, R.K. Jatoth, Hybridizing sine cosine algorithm with differential \nevolution for global optimization and object tracking, Appl. Soft Comput. 62 \n(2018) 1019 –1043 . \n[29] Y. Zhang, R. Liu, A.A. Heidari, X. Wang, Y. Chen, M. Wang, H. Chen, Towards \naugmented kernel extreme learning models for bankruptcy prediction: algorithmic \nbehavior and comprehensive analysis, Neurocomputing 430 (2021) 185–212. [30] Y. Liu, A.A. Heidari, Z. Cai, G. Liang, H. Chen, Z. Pan, A. Alsufyani, S. Bourouis, \nSimulated annealing-based dynamic step shuffled frog leaping algorithm: optimal \nperformance design and feature selection, Neurocomputing 503 (2022) 325–362. \n[31] Y. Xue, B. Xue, M. Zhang, Self-adaptive particle swarm optimization for large-scale \nfeature selection in classification, ACM Trans. Knowl. Discov. Data 13 (2019) 1–27. \n[32] Y. Xue, X. Cai, F. Neri, A multi-objective evolutionary algorithm with interval \nbased initialization and self-adaptive crossover operator for large-scale feature \nselection in classification, Appl. Soft Comput. 127 (2022), 109420 . \n[33] X. Wang, X. Dong, Y. Zhang, H. Chen, Crisscross Harris hawks optimizer for global \ntasks and feature selection, Journal of Bionic Engineering (2022) 1–22. \n[34] W. Shan, H. Hu, Z. Cai, H. Chen, H. Liu, M. Wang, Y. Teng, Multi-strategies boosted \nmutative crow search algorithm for global tasks: cases of continuous and discrete \noptimization, Journal of Bionic Engineering 19 (2022) 1830 –1849 . \n[35] R. Dong, H. Chen, A.A. Heidari, H. Turabieh, M. Mafarja, S. 
Wang, Boosted kernel \nsearch: framework, analysis and case studies on the economic emission dispatch \nproblem, Knowl. Base Syst. 233 (2021), 107529 . \n[36] C. Zhao, Y. Zhou, X. Lai, An integrated framework with evolutionary algorithm for \nmulti-scenario multi-objective optimization problems, Inf. Sci. 600 (2022) \n342–361. \n[37] W. Deng, J. Xu, X.Z. Gao, H. Zhao, An enhanced MSIQDE algorithm with novel \nmultiple strategies for global optimization problems, IEEE Transactions on \nSystems, Man, and Cybernetics: Systems 52 (2022) 1578 –1587 . \n[38] G. Sun, R. Han, L. Deng, C. Li, G. Yang, Hierarchical Structure-Based Joint \nOperations Algorithm for Global Optimization, Swarm and Evolutionary \nComputation, 2023, 101311 . \n[39] K. Yu, D. Zhang, J. Liang, K. Chen, C. Yue, K. Qiao, L. Wang, A correlation-guided \nlayered prediction approach for evolutionary dynamic multiobjective \noptimization, IEEE Trans. Evol. Comput. (2022), 1-1. \n[40] G. Sun, G. Yang, G. Zhang, Two-level parameter cooperation-based population \nregeneration framework for differential evolution, Swarm Evol. Comput. 75 \n(2022), 101122 . \n[41] C. Li, G. Sun, L. Deng, L. Qiao, G. Yang, A population state evaluation-based \nimprovement framework for differential evolution, Inf. Sci. 629 (2023) 15–38. \n[42] G. Sun, C. Li, L. Deng, An adaptive regeneration framework based on search space \nadjustment for differential evolution, Neural Comput. Appl. 33 (2021) 9503 –9519 . \n[43] X. Wen, K. Wang, H. Li, H. Sun, H. Wang, L. Jin, A two-stage solution method based \non NSGA-II for Green Multi-Objective integrated process planning and scheduling \nin a battery packaging machinery workshop, Swarm Evol. Comput. 61 (2021), \n100820 . \n[44] G. Wang, E. Fan, G. Zheng, K. Li, H. Huang, Research on Vessel Speed Heading and \nCollision Detection Method Based on AIS Data, Mobile Information Systems, 2022 . \n[45] Y. Xue, Y. Tong, F. 
Neri, An ensemble of differential evolution and Adam for \ntraining feed-forward neural networks, Inf. Sci. 608 (2022) 453–471. \n[46] J. Chen, Z. Cai, H. Chen, X. Chen, J. Escorcia-Gutierrez, R.F. Mansour, M. Ragab, \nRenal pathology images segmentation based on improved cuckoo search with \ndiffusion mechanism and adaptive beta-hill climbing, Journal of Bionic \nEngineering (2023) . \n[47] Y. Han, W. Chen, A.A. Heidari, H. Chen, Multi-verse optimizer with rosenbrock and \ndiffusion mechanisms for multilevel threshold image segmentation from COVID-19 \nchest X-ray images, Journal of Bionic Engineering 20 (2023) 1198 –1262 . \n[48] J. Xing, H. Zhao, H. Chen, R. Deng, L. Xiao, Boosting whale optimizer with quasi- \noppositional learning and Gaussian barebone for feature selection and COVID-19 \nimage segmentation, Journal of Bionic Engineering 20 (2023) 797–818. \n[49] H. Hu, W. Shan, J. Chen, L. Xing, A.A. Heidari, H. Chen, X. He, M. Wang, Dynamic \nindividual selection and crossover boosted forensic-based investigation algorithm \nfor global optimization and feature selection, Journal of Bionic Engineering \n(2023) . \n[50] X. Wang, X. Dong, Y. Zhang, H. Chen, Crisscross Harris hawks optimizer for global \ntasks and feature selection, Journal of Bionic Engineering 20 (2023) 1153 –1174 . \n[51] C. Lin, P. Wang, A.A. Heidari, X. Zhao, H. Chen, A boosted communicational salp \nswarm algorithm: performance optimization and comprehensive analysis, Journal \nof Bionic Engineering 20 (2023) 1296 –1332 . \n[52] C. Lin, P. Wang, X. Zhao, H. Chen, Double mutational salp swarm algorithm: from \noptimal performance design to analysis, Journal of Bionic Engineering 20 (2023) \n184–211. \n[53] J. Hu, S. Lv, T. Zhou, H. Chen, L. Xiao, X. Huang, L. Wang, P. Wu, Identification of \npulmonary hypertension animal models using a new evolutionary machine \nlearning framework based on blood routine indicators, Journal of Bionic \nEngineering 20 (2023) 762–781. \n[54] J. Liang, K. 
Qiao, K. Yu, B. Qu, C. Yue, W. Guo, L. Wang, Utilizing the relationship \nbetween unconstrained and constrained pareto fronts for constrained \nmultiobjective optimization, IEEE Trans. Cybern. (2022) 1–14. \n[55] C. Huang, X. Zhou, X. Ran, Y. Liu, W. Deng, W. Deng, Co-evolutionary competitive \nswarm optimizer with three-phase for large-scale complex optimization problem, \nInf. Sci. 619 (2023) 2–18. \n[56] J.S. Chou, J.P.P. Thedja, Metaheuristic optimization within machine learning- \nbased classification system for early warnings related to geotechnical problems, \nAutom. ConStruct. 68 (2016) 65–80. \n[57] A. Kaushik, N. Singal, A hybrid model of wavelet neural network and metaheuristic \nalgorithm for software development effort estimation, Int. J. Inf. Technol. 14 \n(2022) 1689 –1698 . \n[58] M. Mehraein, A. Mohanavelu, S.R. Naganna, C. Kulls, O. Kisi, Monthly Streamflow \nPrediction by Metaheuristic Regression Approaches Considering Satellite \nPrecipitation Data, vol. 14, Water, 2022 . X. Zhang et al.\n\n[Página 18]\nComputers in Biology and Medicine 163 (2023) 107166\n18[59] K. Zhu, S. Ying, N.N. Zhang, D.D. Zhu, Software defect prediction based on \nenhanced metaheuristic feature selection optimization and a hybrid deep neural \nnetwork, J. Syst. Software 180 (2021) . \n[60] J.S. Chou, K.H. Yang, J.P. Pampang, P. Anh-Duc, Evolutionary metaheuristic \nintelligence to simulate tensile loads in reinforcement for geosynthetic-reinforced \nsoil structures, Comput. Geotech. 66 (2015) 1–15. \n[61] J.W. Ma, D. Xia, H.X. Guo, Y.K. Wang, X.X. Niu, Z.Y. Liu, S. Jiang, Metaheuristic- \nbased support vector regression for landslide displacement prediction: a \ncomparative study, Landslides 19 (2022) 2489 –2511 . \n[62] N.D. Hoang, D.T. Bui, L. Kuo-Wei, Groutability estimation of grouting processes \nwith cement grouts using differential flower pollination optimized support vector \nmachine, Appl. Soft Comput. 45 (2016) 173–186. \n[63] S. García, A. Fern˘andez, J. 
Luengo, F. Herrera, Advanced nonparametric tests for \nmultiple comparisons in the design of experiments in computational intelligence \nand data mining: experimental analysis of power, Inf. Sci. 180 (2010) 2044 –2064 . \n[64] J. Derrac, S. García, D. Molina, F. Herrera, A practical tutorial on the use of \nnonparametric statistical tests as a methodology for comparing evolutionary and \nswarm intelligence algorithms, Swarm Evol. Comput. 1 (2011) 3–18. \n[65] C. Zhao, H. Wang, H. Chen, W. Shi, Y. Feng, JAMSNet: a remote pulse extraction \nnetwork based on joint attention and multi-scale fusion, IEEE Trans. Circ. Syst. \nVideo Technol. (2022), 1-1. [66] J. Lv, G. Li, X. Tong, W. Chen, J. Huang, C. Wang, G. Yang, Transfer learning \nenhanced generative adversarial networks for multi-channel MRI reconstruction, \nComput. Biol. Med. 134 (2021), 104504 . \n[67] X. Xue, G. Li, D. Zhou, Y. Zhang, L. Zhang, Y. Zhao, Z. Feng, L. Cui, Z. Zhou, X. Sun, \nResearch roadmap of service ecosystems: a crowd intelligence perspective, \nInternational Journal of Crowd Science 6 (2022) 195–222. \n[68] X. Xue, X.-N. Yu, D.-Y. Zhou, X. Wang, Z.-B. Zhou, F.-Y. Wang, Computational \nExperiments: Past, Present and Future, 2022 arXiv preprint arXiv:2202.13690 . \n[69] X. Xue, X. Yu, D. Zhou, C. Peng, X. Wang, D. Liu, F.-Y. Wang, Computational \nexperiments for complex social systems —Part III: the docking of domain models, \nIEEE Transactions on Computational Social Systems (2023) . \n[70] X. Cao, T. Cao, Z. Xu, B. Zeng, F. Gao, X. Guan, Resilience constrained scheduling of \nmobile emergency resources in electricity-hydrogen distribution network, IEEE \nTrans. Sustain. Energy (2022) 1–15. \n[71] Y. Dai, J. Wu, Y. Fan, J. Wang, J. Niu, F. Gu, S. Shen, MSEva: a musculoskeletal \nrehabilitation evaluation system based on EMG signals, ACM Trans. Sens. Netw. 19 \n(2022) 1–23. \n[72] J. Zhou, X. Zhang, Z. 
Jiang, Recognition of imbalanced epileptic EEG signals by a \ngraph-based extreme learning machine, Wireless Commun. Mobile Comput. 2021 \n(2021), 5871684 . X. Zhang et al.",
+ "f81bcb0f-9019-422d-8eb6-9215a5ab70ba": {
+ "content": "Computers in Biology and Medicine 163 (2023) 107166\nAvailable online 9 June 2023\n0010-4825/© 2023 Elsevier Ltd. All rights reserved.An enhanced grey wolf optimizer boosted machine learning prediction \nmodel for patient-flow prediction \nXiang Zhanga, Bin Lub, Lyuzheng Zhangc, Zhifang Pand, Minjie Liaoa, Huihui Shena, \nLi Zhange, Lei Liuf, Zuxiang Lig,*, YiPao Huh,**, Zhihong Gaoi,*** \naWenzhou Data Management and Development Group Co.,Ltd, Wenzhou, Zhejiang, 325000, China \nbWenzhou City Bureau of Justice, Wenzhou, Zhejiang, 325000, China \ncB-soft Co.,Ltd., B-soft Wisdom Building, No.92 Yueda Lane, Binjiang District, Hangzhou, 310052, China \ndThe First Affiliated Hospital of Wenzhou Medical University, Wenzhou, 325000, China \neWenzhou Hongsheng Intellectual Property Agency (General Partnership), Wenzhou, Zhejiang, 325000, China \nfCollege of Computer Science, Sichuan University, Chengdu, Sichuan, 610065, China \ngOrganization Department of the Party Committee, Wenzhou University, Wenzhou, 325000, China \nhWenzhou Health Commission, Wenzhou, Zhejiang, 325000, China \niZhejiang Engineering Research Center of Intelligent Medicine, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, 325000, China \nARTICLE INFO \nKeywords: \nPatient-flow prediction \nSupport vector regression \nMachine learning \nMeta-heuristic \nSwarm-intelligence ABSTRACT \nLarge and medium-sized general hospitals have adopted artificial intelligence big data systems to optimize the \nmanagement of medical resources to improve the quality of hospital outpatient services and decrease patient \nwait times in recent years as a result of the development of medical information technology and the rise of big \nmedical data. However, owing to the impact of several elements, including the physical environment, patient, \nand physician behaviours, the real optimum treatment effect does not meet expectations. 
In order to promote \norderly patient access, this work provides a patient-flow prediction model that takes into account shifting dy-\nnamics and objective rules of patient-flow to handle this issue and forecast patients ’ medical requirements. First, \nwe propose a high-performance optimization method (SRXGWO) and integrate the Sobol sequence, Cauchy \nrandom replacement strategy, and directional mutation mechanism into the grey wolf optimization (GWO) al-\ngorithm. The patient-flow prediction model (SRXGWO-SVR) is then proposed using SRXGWO to optimize the \nparameters of support vector regression (SVR). Twelve high-performance algorithms are examined in the \nbenchmark function experiments ’ ablation and peer algorithm comparison tests, which are intended to validate \nSRXGWO ’s optimization performance. In order to forecast independently in the patient-flow prediction trials, the \ndata set is split into training and test sets. The findings demonstrated that SRXGWO-SVR outperformed the other \nseven peer models in terms of prediction accuracy and error. As a result, SRXGWO-SVR is anticipated to be a \nreliable and efficient patient-flow forecast system that may help hospitals manage medical resources as effec-\ntively as possible. \n1.Introduction \nPrimary medical care is the guarantee of people ’s survival and \ndevelopment. With the continuous development of economic, cultural, \nand social construction, people ’s demand for medical resources is much higher. Their awareness of medical and health care also increases re-\nquirements for the current medical industry. Since the medical service \nsystem is complex, it is not only influenced by factors such as local de-\nmographic characteristics, socio-economic conditions, natural environ -\nmental conditions, medical hardware, software facilities, and patient \n*Corresponding author. \n**Corresponding author. \n***Corresponding author. \nE-mail addresses: zhxan@126.com (X. Zhang), wzlubin@139.com (B. 
Lu), 66199293@qq.com (L. Zhang), panzhifang@wmu.edu.cn (Z. Pan), 1829820@qq.com \n(M. Liao), ylvias7@126.com (H. Shen), 101744491@qq.com (L. Zhang), liulei.cx@gmail.com (L. Liu), lizuxiang@wzu.edu.cn (Z. Li), huyipao@outlook.com (Y. Hu), \ngzh@wzhospital.cn (Z. Gao). \nContents lists available at ScienceDirect \nComputers in Biology and Medicine \nu{�~zkw! s{yo|kro>! ÐÐÐ1ow�o �to~1m{y2w{m k�o2m{y|lt{ yon!\nhttps://doi.org/10.1016/j.compbiomed.2023.107166 \nReceived 10 March 2023; Received in revised form 25 May 2023; Accepted 8 June 2023 \nComputers in Biology and Medicine 163 (2023) 107166\n2and doctor behaviors [1]. But there are also various interactions and \npositive and negative feedback between these influencing factors, which \nmay result in the longer the waiting time in the hospital, the more \nattractive the patients are, or the regular changes in the hospital waiting \nqueue, etc. Self-organized regularities and Emergent behavior make it \ndifficult for hospitals to implement optimal outpatient management \nmeasures and cause the actual use of available resources not to match \nthe expected results [2]. Therefore, to improve the efficiency of existing \nmedical resources, improve the quality of hospital outpatient services, \nshorten patient waiting queues and waiting times, it is crucial to un-\nderstand the changing dynamics and objective patterns of patient-flow \nto provide a basis for dynamic adjustment of physician consultation \nplans and to achieve orderly and effective patient control. \nIn recent years, the advancement of medical informatization and the \nrise of big medical data has allowed studying patient-flow prediction \nbased on big data mining. Researchers have conducted some research in \nthe analysis of patient-flow change patterns, analysis of patient-flow \ninfluencing factors, and patient-flow prediction. Li et al. 
[3] proposed \na time series patient-flow prediction method based on XGBoost, a sup-\nport vector machine (SVM), to solve the problem of planning and allo-\ncation of healthcare resources by government and hospital management. \nNikakhtar et al. [4] proposed a patient visit prediction model based on \neigendistance and mesocentricity that can help healthcare managers and \ndecision-makers predict the trend of infectious patient-flow. Sharafat \net al. [5] proposed an emergency room patient-flow prediction model \n(PatientFlowNet) based on a deep learning framework, including pre-\ndicting arrival, treatment, and discharge rates. The results show that \nPatientFlowNet has higher accuracy and lower average absolute error \nthan the benchmark algorithm. Tavakoli et al. [6] proposed a seasonal \nautoregressive integrated moving average (SARIMA) model for \npatient-flow prediction of the current epidemic of neocrown pneumonia \ndisease, effectively predicting the number of patients’ visits to Thai \nhospitals in the next 30. According to the current research status, it is \neasy to find that more and more researchers are using machine learning \ntechniques to predict the number of patient visits in hospitals. However, \nsince most of the prediction models use a monadic time-series feature \nprediction method and the changes of patient-flow are affected by a \nvariety of complex factors and do not have obvious linear characteris -\ntics, resulting in the accuracy of the models is not high. On the other \nhand, it is limited by the defects of the classification predictor itself, \nwhich leads to large prediction bias of prediction models based on SVM \nand other prediction models. Therefore, how to improve the accuracy \nand reduce the error of patient-flow prediction models is a major chal-\nlenge in current medical resource scheduling research. 
\nAs a novel optimization method with strong robustness and flexi-\nbility, the swarm intelligence optimization algorithm is widely used in \npredictive optimization problems. The swarm intelligence optimization \nalgorithm is a stochastic optimization algorithm abstracted by simu-\nlating the collaborative behavior of animals, insects, and other organ -\nisms. The current well-known algorithms are, grey wolf optimization \n(GWO) [7], bat-inspired algorithm (BA) [8], different evolution (DE) \n[9], sine cosine algorithm (SCA) [10], salp swarm algorithm (SSA) [11], \nwhale optimizer (WOA) [12], moth-flame optimization (MFO) [13], \nparticle swarm optimization (PSO) [14], hunger games search (HGS) \n[15], Harris hawks optimization (HHO) [16], rime optimization algo-\nrithm (RIME) [17], colony predation algorithm (CPA) [18], Runge Kutta \noptimizer (RUN) [19], weighted mean of vectors (INFO) [20], slime \nmould algorithm (SMA) [21,22], opposition-based SCA (OBSCA) [23], \nmodified SCA (m_SCA) [24], boosted GWO (OBLGWO) [25], A-C para-\nmetric WOA (ACWOA) [26], fruit fly optimizer (FOA) with \nmulti-population outpost mechanism (MOFOA) [27], SCA with differ -\nential evolution (SCADE) [28], and so on. They also have been applied to \nsolve many problems such as bankruptcy prediction [29], feature se-\nlection [30–34], economic emission dispatch [35], multi-objective \noptimization [36], global optimization [37,38], dynamic \nmulti-objective optimization [39], numerical optimization [40–42], scheduling optimization [43,44], feed-forward neural networks [45], \nmedical image segmentation [46–48], feature selection [49,50], per-\nformance optimization [51,52], identification of pulmonary hyperten -\nsion animal [53], constrained multi-objective optimization [54], and \nlarge-scale complex optimization [55]. \nMore and more researchers are considering optimizing models using \nswarm intelligence optimization methods to improve the accuracy of \nprediction methods. 
Chou et al. [56] proposed a swarm intelligence \nalgorithm-based support vector machine prediction model (SFALSSVM) \nusing the smart firefly algorithm (SFA) to optimize the parameters of the \nleast squares support vector regression (SVR) and successfully applied it \nto several geotechnical engineering problems. Kaushik et al. [57] pro-\nposed a binary swarm intelligence algorithm by combining the firefly \nalgorithm and bat algorithm with a wavelet neural network (WNN) and \noffered a prediction model for software development effort (SDEE), \nwhich has high prediction accuracy. Mehraein et al. [58] proposed a \nCatBoost (CB) prediction model based on a swarm intelligence algorithm \nfor predicting the monthly flow of satellite precipitation data and \ndemonstrated a significant reduction in the root mean square error \n(RMSE) of the proposed CB compared with an artificial neural network \n(ANN). Zhu et al. [59] combined the WOA and the simulated annealing \nalgorithm (SA) to optimize the kernel extreme learning machine \n(KELM). They proposed an enhanced search-based prediction algorithm \n(EMWS) that effectively addresses defect prediction in software \nmodules. \nZhou et al. [60] improved the Firefly algorithm (FA) by incorpo -\nrating chaotic mapping, adaptive inertia weights, and Levy flight for \naccurate prediction of reinforcement tensile loads for assessing the in-\nternal stability of geosynthetic reinforced soil (GRS) structures. They \nused the improved FA to optimize the hyperparameters of the \nleast-squares SVR model. The improved SVR model had excellent ac-\ncuracy with an average absolute percentage error of less than 10%. Ma \net al. [61] proposed an SVR prediction model integrated with k-fold \ncross-validation (CV) and used an artificial bee colony (ABC) algorithm \nand genetic algorithm (GA) to optimize the hyperparameters of the \nmodel. 
The results showed that the hybrid approach can be used to \ndetermine the optimal hyperparameters and present statistical signifi -\ncance. Huang et al. [62] proposed a swarm intelligence algorithm (DFP) \nintegrating floral pollination algorithm (FPA) and differential evolution \n(DE) and an algorithmic model for predicting the groutability of cement \npaste in combination with SVR. Luo et al. proposed a hybrid prediction \nmodel (LS-SVMR) using a coupled simulated annealing (CSA) algorithm \nto optimize the hyperparameter selection of SVR, which effectively \nimplemented the lateral strength prediction of reinforced concrete (RC) \ncolumns. \nBased on the above improvement methods for prediction models, it \ncan be found that swarm intelligence optimization algorithms can \neffectively help prediction models find optimal hyperparameters, and \nSVR is applied very frequently in many models. However, due to the \nvariety of swarm intelligence algorithms, each algorithm has defects, \nsuch as low convergence accuracy, slow search speed, and easy falling \ninto local optimality. Therefore, in this paper, to accurately predict the \nnumber of patients and reasonably schedule medical resources, an SVR \nprediction model based on improved GWO is proposed using the GWO \nalgorithm with high exploitation capability combined with SVR pre-\ndiction methods. First, to give full play to the exploitation advantages of \nGWO and overcome the shortcomings of GWO in the search process as \nmuch as possible, the following three methods are used for improve -\nment: (1) To address the problem of narrow coverage of the initialized \nsearch agent of GWO, the original random initialization method is used \ninstead of Sobol sequence to expand the distribution of the initial so-\nlution. 
(2) To address the problem of too little information exchange \namong GWO search agents, a directional mutation mechanism is used to \nincrease the interactivity of solutions, improving the algorithm’s search \nefficiency. (3) To address the problem of imbalance between GWO \nsearch and exploitation, a Cauchy random replacement strategy is added X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n3to the core update formula to adjust the weights of search and exploi -\ntation of the algorithm in the iterative process. Based on the above ideas, \nSobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism are intro-\nduced into GWO to propose a high-performance GWO variant \n(SRXGWO). Then, to verify the optimization performance of SRXGWO, \nthis paper designs comparative simulation experiments based on the \nclassical IEEE CEC2014 test set and compares SRXGWO with other X \nmethods. The experiments show that the proposed SRXGWO method \nsignificantly improves initialization, search efficiency, and defects of \niterative balance. This paper also analyzes the comparative results using \nthe Wilcoxon signed-rank test [63] and the Friedman test [64]. \nSRXGWO has a higher convergence speed compared with peer algo-\nrithms and accuracy. \nFurther, this paper proposes a multivariate SRXGWO-SVR prediction \nmodel for predicting patient flow by optimizing two hyperparameters of \nSVR using high-performance SRXGWO. To validate the real prediction \nability of the SRXGWO-SVR model, the prediction results of the model \nare presented in detail using real clinical data sets and divided into \ntraining and test sets. Further, the SRXGWO-SVR model based on \nSRXGWO, the GWO-SVR model based on GWO, and the original SVR \nmodel are compared in this paper, and the experimental results also \ndemonstrate that the SRXGWO-SVR can effectively outperform the two \noriginal models without improvement. 
Finally, this paper also compares \nthe SRXGWO-SVR model with well-known prediction models such as \nRadial basis function networks, convolutional neural networks, etc. R- \nsquared (R2), root mean squared error (RMSE), and mean absolute error \n(MAE) are used for validation and confirm that SRXGWO-SVR is more \nadvantageous in predicting hospital patient-flow. The data set used in \nthis paper is the attendance statistics of Wenzhou Medical University \nHospital in China, which serves a radius of nearly 30 million people and \nhas an annual outpatient volume of 5.3 million. Due to the large volume \nof data, the latest data from January 2022 to September 2022 is selected, \nwith a sample size of 240 items. The main contributions of this paper are \nas follows. \n1. Sobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism are \nintroduced into GWO to propose a high-performance algorithm \nSRXGWO. The strategies and mechanisms employed in this paper can \nprovide a valid reference for the field of evolutionary computation. \n2.We designed experiments comparing SRXGWO with 12 similar al-\ngorithms to verify the algorithm ’s improvement ideas and optimi -\nzation performance. Experiments can effectively demonstrate the \nperformance of SRXGWO ’s benchmark functions and provide illus-\ntrations for their specific applications. \n3. SRXGWO is used to optimize the hyperparameters of SVR, and the \nSRXGWO-SVR multivariate prediction model is proposed and suc-\ncessfully applied to predict patient flow. The proposed model can \neffectively predict patient flow and provide useful suggestions for \nhospital management. \n4. We designed a comparison experiment between SRXGWO-SVR and \neight similar prediction models to verify the effectiveness of the \nimprovement and the accuracy of the prediction. 
The experiments \nillustrate that the proposed model has great potential for predicting \nother time series problems. \nThe rest of this paper is organized as follows. Section 2 describes the \nprediction dataset, the original GWO, and SVR. In Section 3, SRXGWO is \nproposed based on three improvement strategies, and the SRXGWO-SVR \nmodel is proposed in conjunction with SVR. In Section 4, benchmark \nfunction comparison experiments and simulation prediction comparison \nexperiments are designed. Finally, Section 5 summarizes the work of this \npaper and illustrates further research directions. 2.Materials and methods \nThis section introduces the swarm intelligence optimization algo-\nrithm GWO and the regression prediction model SVR used in this study. \n2.1. Description of GWO algorithm \nIn the GWO algorithm, grey wolf individuals are divided into four \nclasses: α、β、δ and ω. α is mainly responsible for participating in the \ndecision-making and management of the pack; ω is for other grey wolf \nindividuals; β and δ are for grey wolf individuals with the second highest \nadaptation level to α. The GWO algorithm focuses on three behaviors: \nencirclement behavior, hunting behavior, and attack behavior. \n1. Encirclement behavior \nThe first stage of prey predation by grey wolves is to encircle the \nprey, and the mathematical model can be described by Eq. (1) and Eq. \n(2). \nD↗⃦⃦⃦⃦C↗⋅X↗\np
(t) − X(t)‖  (1)

X(t+1) = X_p(t) − A · D  (2)

where D is the distance between the prey and the wolves; A = 2a · r1 − a; C = 2 · r2; X is the current location of the wolves; t is the number of current iterations; X_p is the location of the prey; r1 and r2 are random numbers between 0 and 1; and a decreases linearly from 2 to 0 over the iterations.

2. Hunting behavior

After a wolf pack surrounds a prey, it will hunt the surrounding prey. If α is the global optimal solution, β is the global second solution, and δ is the global third solution, then the mathematical model of α, β, and δ repositioning can be described by Eqs. (3)–(5).

D_α = ‖C_1 · X_α − X‖  (3)

D_β = ‖C_2 · X_β − X‖  (4)

D_δ = ‖C_3 · X_δ − X‖  (5)

where D_α, D_β and D_δ denote the approximate distances of α, β and δ from X, respectively; X_α, X_β, X_δ denote the position information of α, β, and δ, respectively; C_1, C_2 and C_3 denote the random vectors, respectively. The current solution X and the updated solution X(t+1) can be described by Eq. (6)–Eq. (9).

X_1 = X_α − A_1 · D_α  (6)

X_2 = X_β − A_2 · D_β  (7)

X_3 = X_δ − A_3 · D_δ  (8)

X′
t1[\nX↗\n1X↗\n2X↗\n3][\n3 (9) \nwhere A↗\n1 , A↗\n2 , and A↗\n3 denote random vectors, respectively. X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n43. Attack behavior \nThe final stage of the GWO algorithm is the prey attack phase, which \ncan be achieved by adjusting the parameter A. If †A†≼1, the whole wolf \npack approaches the prey
X∗CY∗and focuses on the prey; if †A†F1, the \nwhole wolf pack moves away from the prey and looks for new prey \nagain. \n2.2. Description of support vector regression \nSupport vector machine (SVM) models are used to classify data by \nmapping the input metric data to a higher dimensional space, then \nconstructing an optimal hyperplane in this higher dimensional space so \nthat the constructed hyperplane has the largest edges to classify the \ninput data. The learning strategy used by the support vector machine is \ninterval maximization, which can be formalized as solving a convex \nquadratic programming problem. \nInstead of the traditional statistical induction followed by deduction, \nthe SVR model constructs a regression function to infer a prediction \nmodel on the training data and then uses the model to make predictions. \nThe objective of SVR modeling is to build a classification surface that \nseparates the two types of samples as well as possible. SVR modeling \naims to minimize the distance between all the sample data and the \nclassification surface. The accuracy of the SVR model is highly depen -\ndent on the kernel function ’s quality and the penalty factor ’s accuracy, \nand the appropriate choice of parameters dramatically improves the \naccuracy of the regression model. When the parameters of the regression \nmodel are not selected appropriately, the regression model will not be \napplicable to solve the actual problem. For the training data, regression \naims to solve the following regression function, as in Eq. (10). \nf
y〈W0y〉b (10) \nThe above equation is 〈w0y〉 is the inner product of w and y. The \nfollowing equation is the constraint to solve the constrained optimiza -\ntion problem: \nMin 1⎡\n2Dw0wFĈm\ni1\u0000\nξiξ∗\ni)\n(11) \nZi\u0000Dw0yiFb≼εξi (12) \nDw0yiF\u0000zib≼εyj\u0000yk (13) \nwhere C represents the penalty factor of the model, the value of C is \npositively related to the complexity of the model, the complexity of the \nmodel increases with the value of C, and the value of C is negatively \nrelated to the computational error of the model, the error of the model \nbecomes smaller as the value of C increases. \nThe solution of the optimization problem is first transformed into the \ncorresponding pairwise problem and, secondly transformed into the \nsolution of the maximum constraint value by introducing the kernel \nfunction. Finally, the regression equation of the model is shown in Eq. \n(14). \nf
ŷm\ni1\u0000\naj\u0000aj)0k\u0000\nyCyj)\nb (14) \n3.The proposed method \nIn this section, three improvement ideas are described, namely, \nSobol sequence-based population initialization, Cauchy random \nreplacement strategy, and directional mutation mechanism. Finally, the \nproposed SRXGWO is used to optimize the hyperparameters of the SVR \nmodel, and the patient-flow prediction model SRXGWO-SVR is \nproposed. 3.1. Proposed GWO variant \n3.1.1. Sobol sequence-based population initialization \nThe population initialization of the original GWO algorithm is \nrandomly generated, which primarily affects the algorithm ’s perfor -\nmance. In contrast, the Sobol sequence can make the spatial points \nuniformly distributed and generate unlimited samples without pre- \ndetermining the number of samples and storing them. Therefore, this \npaper introduces the Sobol sequence to filter the initialization position \nof the grey wolf population, improve the uniformity and diversity of the \ngrey wolf population, and improve the performance of the original GWO \nalgorithm. \nEach dimension of the Sobol sequence is a Radical inversion with \nbase 2, and each dimension has a different generating matrix C. When C \nis taken as a unit vector, the corresponding Sobol sequence is repre -\nsented as \nN
N_i = Σ_{k=1}^{M} 2^(−k) · a_k(i)  (15)

where i is denoted as a binary number, a_k(i) on each bit of the number is arranged as a vector, which is mirrored to the right of the decimal point and converted to decimal, resulting in a one-dimensional Sobol sequence X_i = {N_1, N_2, …, N
iCi∃N⊓, and a multi-dimensional Sobol \nsequence is obtained by multiplying the generating matrix C of each \ndimension. The Sobol sequence is used to uniformly distribute n points \nwithin the threshold of the target parameter search as the initialized \npopulation space location. The first three solutions are defined as α, β, \nand δ wolves, respectively. To confirm the effectiveness of Sobol \nsequence-based population initialization, Ablation experiments of \nSRXGWO are designed in Section 4.1.2 , where SGWO is the improved \nGWO using this strategy alone. \n3.1.2. Cauchy random replacement strategy \nIn the iterative process, the position update of GWO is conservative. \nOn the one hand, such an update is beneficial to the exploitation of the \nalgorithm. Still, on the other hand, it may cause the algorithm to have a \npoor quality of the search solution and fall into local optimum when \ndealing with multi-peaked problems. Therefore, in this paper, to solve \nthis problem, the Cauchy replacement search strategy is used to \nappropriately perturb the dimensionality of the search agent and \nimprove the interaction between individuals. \nSpecifically, firstly, the grey wolf population with the number of \nindividuals N is traversed by the parameter l, and the selected one is the \nXl individual. Then, according to the ratio of the remaining runs of the \nalgorithm to the total number of runs compared with the Cauchy \nrandom number, if the Cauchy random number is less than the ratio, the \nh-th dimensional value of Xl is replaced with the hth dimensional value \nof the optimal solution α wolves. Finally, the fitness value of the updated \nXl The evaluation function calculates the optimal solution, and the \noptimal fitness value are replaced if the fitness value is better than the \noptimal solution. Otherwise, it remains unchanged. 
To confirm the \neffectiveness of the Cauchy replacement search strategy, RGWO in \nAblation experiments of SRXGWO is the GWO improved using this \nstrategy alone. \n3.1.3. Directional mutation strategy \nSince the original GWO relies too much on the searchability of the \ntop three ranked wolves to find the optimal solution, it is easy to fall into \nthe local optimal trap and reduce the accuracy of the optimal solution. \nTherefore, this paper proposes a directional mutation strategy based on \ngenetic algorithms ’ mutation and crossover strategies. The directional \nmutation strategy consists of two important operations: directional \ncrossover and directional variation. \n1. Directional crossover (DM) X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n5The when-directed crossover mechanism uses the position informa -\ntion of the current iteration ’s optimal individual to guide the in-\ndividual ’s next change trend. There are four main parameters, which are \ncrossover rate (pc), variable crossover probability (pcv), directional \nprobability (pd) and multiplication factor (α). First, the execution of the \ndirected crossover mechanism requires different parent individuals in \nthe current population. The parent individuals are generated by random \nselection from the population, pj\n1 and pj\n2, j∃1Cd]. pj\nmean and pj\nbest are the \nmean value of the parent individuals in the jth dimension and the value \nof the best individual in the jth dimension, respectively. In the first case, \nwhen pj\nbest≽pj\nmean (c1 and c2 does the directed hybridization mechanism \ngenerate the individuals). \nval1\u00000B5e⌈\n†pj\n1\u0000pj\n2†\n
yj\nu\u0000yj\nl⌉\n(16) \nβr3\nα2(17) \nc1val∗\u0000\npj\n1\u0000pj\n2)\nαr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (18) \nc2
1\u0000val∗\u0000\npj\n1\u0000pj\n2)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (19) \nc1val∗\u0000\npj\n1pj\n2)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (20) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\nα
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (21) \nWhen pj\nbestDpj\nmean. \nc1val∗\u0000\npj\n1pj\n2)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (22) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\nα
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4≼pd (23) \nc1val∗\u0000\npj\n1pj\n2)\nαr3∗e
1\u0000β∗
1\u0000val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (24) \nc2
1\u0000val∗\u0000\npj\n1pj\n2)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗⃦⃦pj\n1\u0000pj\n2⃦⃦Cifr4Fpd (25) \nIf the parent individuals have the same value, but pj\nbestℑpj\nmean. \nval1\u00000B5e⌈\n†pj\nbest\u0000pj\nmean†\n
yj\nu\u0000yj\nl⌉\n(26) \nβr3\nα2(27) \nc1val∗\u0000\npj\nbestpj\nmean)\nαr3∗e
1\u0000β∗
1\u0000val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4≼pd\n(28) \nc2
1\u0000val∗\u0000\npj\nbestpj\nmean)\n\u0000α
1\u0000r3∗e
\u0000β∗val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4\n≼pd\n(29) \nc1val∗\u0000\npj\nbestpj\nmean)\n\u0000αr3∗e
1\u0000β∗
1\u0000val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4Fpd\n(30) \nc2
1\u0000val∗\u0000\npj\nbestpj\nmean)\nα
1\u0000r3∗e
\u0000β∗val∗\u0000\npj\nbest\u0000pj\nmean)\nCifr4Fpd\n(31) \nwhere r3 and r4 are two different random numbers, r3∃
(0, 1) and r4 ∈ 
0C1. val and β are two parameters computed in each iteration. yj\nu and yj\nl \nare the upper and lower bounds of the individual in the jth dimension, \nrespectively. А is the multiplicative factor. \n2. Directional variation First, assume that the dimensions of population size and objective \nfunction are D and d, respectively. Assume that the current iteration \nindividual is y. The guided variation mechanism guides the variation of \nthe current iteration individual y based on the position information of \nthe current optimal individual ybest. When individual y is selected for \nguided mutation operation, the DM mechanism will compare the size of \nyj\ni and yj\nbest, if yj\nbest≽yj\ni. \nβ1e[\n2r\u00002\nr]\n(32) \nβ2e[\nr\u00002\nr]\n(33) \nym|\n〈\n⎜yj\niβ1∗\u0000\nyj\nu\u0000yj\ni)\nCifr2≼pd\nyj\ni\u0000β2∗\u0000\nyj\ni\u0000yj\nl)\nCotherwise(34) \nwhere β1 and β2 are two parameters, which can also be called the \nweights that determine the change steps of the formula. r and r2 are two \nrandom numbers, r∃
0C1and r2∃
0C1, rℑ0. yj\nu and yj\nl are the upper \nand lower bounds of the individual in the jth dimension, respectively. pd \nrepresents the orientation probability, pd∃
0B5C1. If yj\nbestDyj\ni. \nym|\n〈\n⎜yj\ni\u0000β1∗\u0000\nyj\ni\u0000yj\nl)\nCifr2≼pd\nyj\niβ2∗\u0000\nyj\nu\u0000yj\ni)\nCotherwise(35) \nTo illustrate the effectiveness of the Directional mutation strategy, \nthe XGWO in ablation experiments of SRXGWO is the GWO improved \nusing this strategy alone. \n3.1.4. Proposed SRXGWO \nThe analysis shows that GWO is an excellent algorithm with solid \nexploitation capability, but several aspects still need improvement. First, \nGWO is randomly generated with strong uncertainty in the initialization \nof the grey wolf population, which will lead to the initial solution of the \nwhole population cannot effectively cover the solution space of the \nproblem, thus causing problems such as low efficiency in the search \nphase. Secondly, the lack of information exchange among individuals in \nthe iterative process of GWO tends to make the algorithm suffer from \npoor-quality of search solutions and fall into local optimum when \ndealing with multi-peaked problems. In addition, GWO relies too much \non the exploitation ability of the top three ranked wolves to find the \noptimal solution, which cannot effectively search the whole solution \nspace, leading to the inability to find the optimal solution and reducing \nthe quality of understanding. \nTherefore, this paper addresses the above three problems and makes \ncorresponding improvements to GWO. First, Sobol sequence-based \npopulation initialization is used instead of the original random initiali -\nzation method to generate a low-sequence population of grey wolves, \nwhich covers the whole solution space uniformly. Second, the dimen -\nsional values between search agents are effectively exchanged by Cau-\nchy’s random replacement strategy to enhance the information \nexchange between individuals and improve the exploitation capability \nof the algorithm. 
Third, the directional mutation mechanism is intro-\nduced to perform crossover and mutation at the level of the search so-\nlution, and the crossover or mutation operation is performed for the \nnature of the current individuals, which effectively improves the search \nability of the algorithm and the ability to jump out of the local optimum. \nThe algorithm flowchart of SRXGWO as shown in Fig. 1. \n3.2. The proposed SRXGWO-SVR model \nTo accurately predict the number of patients and reasonably \nschedule medical resources, this section combines the high-performance X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n6SRXGWO algorithm with the SVR prediction method and proposes the \nSRXGWO-SVR, an SVR prediction model based on the improved GWO. \nAccording to Section 2.2, SVR is a supervised machine learning \nmethod with two key parameters: the penalty parameter C and the \nkernel function parameter g. The penalty parameter C affects the \ncomplexity and stability of the model, the kernel function parameter \nreflects the distribution of samples in the feature space, and the \nparameter selection directly impacts the prediction accuracy and \ngeneralization ability of the model. Therefore, to address the above is-\nsues, SRXGWO is introduced to optimize the radial basis kernel function \nparameters and penalty factors in the SVR patient-flow prediction model \nto form the best combination of parameters to improve the prediction \naccuracy and reduce the error size. The specific steps for building the \nSRXGWO-SVR model are as follows. \n(1) Data pre-processing. Routine data pre-processing is performed on \nthe collected patient-flow data, including data cleaning, missing \nvalue processing, outlier processing, etc. \n(2) Establish the objective function. The sample data are substituted \ninto the mean square error minimization function as shown in Eq. 
(36), and then the optimal penalty factor C and radial basis kernel function parameter γ are obtained.

Q_m
CCσ1\nn̂n\nk1
yk\u0000}yksBtBC∃CminCCmaxCγ∃γminCγmax (36) \nwhere yk denotes the actual size of the patient flow, and √yk denotes the \ncorresponding size value of the patient-flow prediction. \n(3) Search for hyperparameters using SRXGWO. First, the parameters \ninvolved in the SRXGWO algorithm are set initially. The fitness \nfunction RMSE is applied to calculate the fitness values of the \npopulation individuals, where m is the number of samples. RMSE⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪⎪\n1\nm̂m\nk1
yk\u0000}yk2̅\n(37) \n(4) Determine whether the maximum number of iterations is \nreached. The iteration is continued if the maximum number of \niterations is not reached. Suppose the maximum number of iter-\nations is reached. In that case, the C and γ corresponding to the \noptimal individual location information is output. The best \ncombination of the two parameters is applied to build the \nSRXGWO-SVR prediction model. Then the patient-flow dataset is \npredicted. \nThe flow chart of the SRXGWO-SVR prediction model based on \nhospital patient-flow proposed in this section is shown in Fig. 2. \n4.Experimental results and discussions \nIn this section, ablation and benchmark function experiments are \ndesigned to validate the global optimization performance of SRXGWO. \nThen, the proposed SRXGWO-SVR is used in patient-flow prediction \nexperiments to demonstrate the accuracy and validity of SRXGWO-SVR. \n4.1. Benchmark functions comparison experiment \n4.1.1. Benchmark test experiment setup \nFirst, the running environment of the benchmark function test \nexperiment needs to be described. the software of the experiment is \nMatlab2017b and the core hardware is Intel(R) Xeon(R) CPUE5-2660v3 \n(2.60 GHz). The benchmark function test set used in this section is the \ncurrently familiar IEEE CEC2014, described in detail in Table 1. The \ncomparison experiments include SRXGWO and GWO and well-known \nFig. 1.Algorithm flow chart of SRXGWO \nThe algorithmic complexity of SRXGWO comes \nmainly from Sobol sequences, core formula updates, \nCauchy random replacement strategy, and directional \nmutation mechanism. The complexity level of Sobol \nsequence initialization is O
(N); the computational complexity level of the core formula is O(N² + N · log N); the computational complexity level of the Cauchy random replacement strategy is O(N · log N); and the complexity level of the directional mutation mechanism is O(N²). By comprehensive calculation, the overall complexity level of SRXGWO is O(SRXGWO) = O(
N2N∗logN. X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n7algorithms such as PSO, SCA, etc. Therefore, to ensure the validity and \nfairness of the experiments, all swarm intelligence algorithms are \nsearched in dimension 30, the population size is 30, the number of \nevaluations is also uniformly 300,000, and the internal parameters of \nthe algorithms are all default values. Finally, to ensure the correctness \nand validity of the experimental results, all the algorithms were run \nindependently 30 times, and the results of the experiments were further \nverified using Wilcoxon signed-rank test and the Friedman test. \n4.1.2. Ablation experiments \nIn this section, ablation experiments of SRXGWO were designed to \ndiscuss the effects of Sobol sequence-based population initialization, \nCauchy random replacement strategy, and directional mutation mech -\nanism on the effect of GWO. First, the experiments combined the three \nimproved strategies with GWO by permutation, including GWO itself, \nwith a total of eight algorithms, as shown in Table 2. In the table, S \nstands for Sobol sequence-based population, R stands for Cauchy \nrandom replacement strategy, and X stands for directional mutation \nmechanism. in addition, “1″ indicates that the current strategy is used, \nand “0″ indicates that no strategy is used. For example, SGWO uses the \nSobol sequence but not the other two strategies. \nTable 3 shows the experimental results of SRXGWO with the other \nseven algorithms, including the Wilcoxon signed-rank test results and P- \nvalue. The number of algorithms that are “better than/equal to/worse \nthan ” other algorithms. “Mean ” indicates the average ranking of the 30 \nfunctions tested, and “rank ” indicates the final overall ranking. In the \nresults of the Wilcoxon test, SRXGWO is 23 better than the unimproved \nGWO, which indicates that the improvement of GWO by the three \nimprovement strategies is very significant. 
In addition, SRXGWO has a \nsignificant advantage over SGWO, RGWO, and XGWO using a single \nmechanism, with at least 14 stronger than them. Finally, SRXGWO has \nan advantage over the two-two combination of SRGWO, SXGWO, and \nRXGWO, indicating that the three SRXGWO improvement strategies are \neffective. The table also shows the empirical p-values, and the bolded data indicate that SRXGWO is significantly different from other algo-\nrithms, and it can be said that the advantage of SRXGWO is more \nprominent compared to other algorithms. In summary, the mechanism \nemployed in SRXGWO is reasonable and effective, and can significantly \nimprove the performance of GWO. \n4.1.3. Comparison of SRXGWO with well-known peer algorithms \nIn this subsection, similar algorithm comparison experiments are \ndesigned based on 30 benchmark functions to compare SRXGWO with \n12 other peer algorithms to demonstrate that the proposed algorithm \nhas more robust optimization performance among the same type of al-\ngorithms. Among the compared algorithms, six original algorithms are \nPSO, SCA, MFO, WOA, BA, and FA, all highly cited algorithms. The other \nsix algorithms are new variants proposed recently, including OBSCA, \nm_SCA, OBLGWO, ACWOA, MOFOA, and SCADE. \nTable 4 shows the experimental results of the comparison. Where \nAVG denotes the average optimal fitness value of 30 independent ex-\nperiments, STD denotes the variance of the experiments, and the bolded \ndata are the optimal values of the current function of the algorithm. In \nthe experimental results, SRXGWO finds the optimal solution relative to \nits peer algorithms in most of the function evaluations, especially in the \nclass of complex functions F23–F30, which indicates that SRXGWO is \nmore advantageous in dealing with complex problems. In addition, the \nSTD fluctuation of SRXGWO is small, which suggests that the algorithm \nhas strong stability. 
\nSimilarly, to further validate the SRXGWO experimental results, we \nused the Wilcoxon signed-rank test to compare and validate SRXGWO, \nand the results are shown in Table 5; the Friedman test was used to verify \nthe average ranking of SRXGWO, and the results are shown in Fig. 3, \nwhich can be more intuitive to observe the comparison results. The \nWilcoxon signed-rank test results show that SRXGWO ranks first overall \nwhen comparing other algorithms and is at least 19 better than other \nhigh citation algorithms and 20 better than other variants. The Friedman \ntest shows that the average ranking of SRXGWO is slightly different, but \nFig. 2.SRXGWO-SVR prediction model based on hospital patient-flow. X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n8it is still better than PSO and MFO algorithms, and the overall perfor -\nmance is also the first. In summary, the results of the comparison \nexperiment are valid and reasonable, and SRXGWO does outperform \nother peer algorithms. \nTo further demonstrate the advantages of SRXGWO over other al-\ngorithms, this experiment recorded the optimization search process of \neach algorithm and plotted it as an iterative curve, as shown in Fig. 4. \nThe horizontal coordinate indicates the number of evaluations, and the \nvertical coordinate indicates the fitness value. Firstly, it can be seen that \nSRXGWO has good convergence accuracy on F6, F8, F9, F10, F11 and \nF13 in unimodal and simple multimodal function classification and \nfaster search speed than other similar algorithms. In addition, it can be \nobserved in the hybrid and combinatorial functions F16, F23, F30 that \nSRXGWO also has excellent results in solving complex optimization \nproblems. Further in the figure, SRXGWO has a clear advantage in the \nF6, F8, F9, F10, and F16 test functions. 
In future work, it can also be applied to more cases
Three independent attribute series are selected
The number of \npopulations is set to 20, the dimension is defined as 2, the maximum \nnumber of iterations is 50, the upper and lower bounds for the value of C \nare 100 and 0.1, and the upper and lower bounds for the value of R are \nalso 100 and 0.1. Then, to prove the effectiveness of the prediction \nmodel SRXGWO-SVR improvement, the SRXGWO-SVR was compared \nwith GWO-SVR and the original SVR in the experiments. Also, to prove \nthe effectiveness of SRXGWO-SVR model, backpropagation (BP), Table 1 \nDescription of the 30 benchmark functions. \nClass No. Functions F∗\ni\nFi
RXGWO 0 1 1
The three evaluation metrics are the Spearman correlation coefficient ($R^2$) of Eq. (38), the mean absolute error (MAE) of Eq. (39), and the root mean square error (RMSE) of Eq. (40).

$$R^{2} = 1 - \frac{\sum_{k=1}^{m}\left(y_{k} - \hat{y}_{k}\right)^{2}}{\sum_{k=1}^{m}\left(y_{k} - \bar{y}\right)^{2}} \tag{38}$$

$$\mathrm{MAE} = \frac{1}{m}\sum_{i=1}^{m}\left|y_{i} - \hat{y}_{i}\right| \tag{39}$$

$$\mathrm{RMSE} = \sqrt{\frac{1}{m}\sum_{k=1}^{m}\left(y_{k} - \hat{y}_{k}\right)^{2}} \tag{40}$$

where $m$ is the number of samples, $y_k$ is the actual value of the $k$-th test sample, $\bar{y}$ is the mean of the test samples, and $\hat{y}_k$ is the predicted value of the $k$-th test sample.

4.2.3. Prediction results and analysis
To perform regression calculations on the decomposed subsequences using the SVR model, the patient-flow data set needs to meet the input format of the SVR model. For this purpose, the original data samples are processed as follows.

First, for the time series $y_1, y_2, \ldots, y_n$, define the input matrix

$$X = \begin{bmatrix} y_{1} & \cdots & y_{d} \\ \vdots & \ddots & \vdots \\ y_{n-d} & \cdots & y_{n-1} \end{bmatrix} \tag{41}$$

where $d$ is the step-size parameter, i.e., the number of sample attributes, which in this paper is 4.

Then, define the output labels

$$y = \begin{bmatrix} y_{d+1} \\ \vdots \\ y_{n} \end{bmatrix} \tag{42}$$

Finally, use X and y defined above as the input and label of the SVR model, respectively. In practice, X and y are divided into a training set and a test set in the ratio of 1:1. The training set is used to train the model and determine the optimal parameters of the model. Then, the trained model is simulated and tested on the test set to demonstrate the training effect of the prediction model. Finally, the accuracy performance of the model is verified by evaluating the metrics $R^2$, RMSE, and MAE. The following are the experimental results and the analysis of the training and test sets.

1. Prediction experiments on the training set
The patient-flow dataset is divided into 120 sample sets by a 1:1 split as the training set for training eight prediction models: SRXGWO-SVR, GWO-SVR, SVR, BP, RF, KELM, RBF, and CNN.
The Original and Predicted lines nearly overlap, which indicates \nthat the prediction is very accurate. The large deviations between the Table 3 \nResults of Wilcoxon signed-rank test for ablation experiments and P-value. \nItem SRXGWO GWO SGWO RGWO XGWO SRGWO SXGWO RXGWO \n/\u0000/ ~ 23/1/6 15/1/14 14/3/13 18/0/12 6/5/19 9/0/21 9/2/19 \nMean 2.57 6.90 5.40 4.47 4.67 2.93 4.13 3.53 \nRank 1 8 7 5 6 2 4 3 \nF1 N/A 1.9209E-06 1.0246E-05 4.0483E-01 4.7162E-02 2.8948E-01 9.7772E-02 1.6503E-01 \nF2 N/A 1.9209E-06 1.9209E-06 8.3071E-04 1.6394E-05 2.4118E-04 3.7243E-05 3.3269E-02 \nF3 N/A 1.7344E-06 1.7344E-06 6.0350E-03 8.9364E-01 6.8359E-03 6.2683E-02 3.1849E-01 \nF4 N/A 2.3704E-05 3.8822E-06 6.2884E-01 3.6094E-03 4.4052E-01 7.8647E-02 5.9994E-01 \nF5 N/A 1.7344E-06 2.6033E-06 6.8923E-05 1.7344E-06 4.1955E-04 1.7344E-06 8.1302E-01 \nF6 N/A 4.7162E-02 3.1618E-03 7.0356E-01 4.4052E-01 9.0993E-01 9.0993E-01 9.0993E-01 \nF7 N/A 1.7344E-06 1.7344E-06 1.1499E-04 1.2453E-02 4.0715E-05 3.1618E-03 6.5833E-01 \nF8 N/A 1.7344E-06 1.9209E-06 9.3676E-02 1.7344E-06 1.9861E-01 2.3534E-06 7.1889E-01 \nF9 N/A 3.6004E-01 2.9894E-01 2.4308E-02 8.6121E-01 5.5774E-01 2.2888E-01 7.0356E-01 \nF10 N/A 1.7344E-06 1.7344E-06 4.7162E-02 2.1266E-06 4.7162E-02 1.9209E-06 6.2884E-01 \nF11 N/A 7.3433E-01 3.0861E-01 5.0383E-01 4.1653E-01 1.3591E-01 9.2626E-01 5.5774E-01 \nF12 N/A 8.2206E-02 5.4401E-01 5.9836E-02 3.3173E-04 7.7309E-03 1.1079E-02 3.6004E-01 \nF13 N/A 2.2102E-01 3.9333E-01 1.8462E-01 5.5774E-01 2.9894E-01 3.1849E-01 4.1653E-01 \nF14 N/A 1.3975E-02 1.8326E-03 2.6230E-01 8.5896E-02 1.2544E-01 1.7791E-01 2.3694E-01 \nF15 N/A 1.4773E-04 6.3391E-06 3.6826E-02 4.9080E-01 2.7653E-03 1.8462E-01 1.0201E-01 \nF16 N/A 5.3197E-03 2.9575E-03 1.1138E-03 7.5213E-02 2.5637E-02 1.7138E-01 6.5641E-02 \nF17 N/A 9.8421E-03 3.0861E-01 3.1849E-01 8.7297E-03 3.8723E-02 7.1889E-01 6.5833E-01 \nF18 N/A 6.8359E-03 9.3157E-06 8.5896E-02 8.9187E-05 6.5641E-02 1.4936E-05 1.4773E-04 \nF19 N/A 1.4839E-03 8.9443E-04 
1.9861E-01 6.4352E-01 1.3591E-01 2.0589E-01 2.1827E-02 \nF20 N/A 1.9209E-06 1.7344E-06 5.3070E-05 5.3044E-01 5.2165E-06 1.5886E-01 7.3433E-01 \nF21 N/A 9.0993E-01 4.7795E-01 7.5213E-02 1.0639E-01 2.1827E-02 8.2901E-01 5.0383E-01 \nF22 N/A 1.6503E-01 6.5641E-02 7.1903E-02 1.6503E-01 3.8203E-01 1.8519E-02 2.4519E-01 \nF23 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF24 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF25 N/A 1.2290E-05 1.0000E00 1.7344E-06 5.6061E-06 1.0000E00 1.0000E00 1.7344E-06 \nF26 N/A 1.9729E-05 1.6566E-02 1.0357E-03 1.3820E-03 1.5286E-01 3.1603E-02 3.1618E-03 \nF27 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF28 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF29 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 \nF30 N/A 1.7344E-06 1.0000E00 1.7344E-06 1.7344E-06 1.0000E00 1.0000E00 1.7344E-06 X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n10Table 4 \nComparison results of SRXGWO with other algorithms. 
\nFun F1 F2 F3 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.5817E07 8.9158E06 1.7773E08 1.4681E08 4.5005E03 3.3271E03 \nPSO 9.0808E06 1.6903E06 1.4837E08 1.5123E07 9.9378E02 1.2790E�02 \nSCA 2.2839E08 6.9799E07 1.6889E10 2.3915E09 3.7046E04 6.6934E03 \nMFO 8.7549E07 1.0414E08 1.0114E10 5.9855E09 1.0275E05 5.8223E04 \nWOA 2.7540E07 1.1331E07 5.0637E06 8.0209E06 3.2575E04 2.0632E04 \nBA 7.7059E�05 3.5272E�05 5.2698E�05 2.7431E�05 4.2251E�02 1.6464E02 \nFA 2.5269E08 5.1675E07 1.5002E10 1.8122E09 6.4325E04 1.0623E04 \nOBSCA 4.0160E08 1.2958E08 2.4801E10 4.7138E09 5.0550E04 9.2351E03 \nm_SCA 6.3874E07 4.1104E07 6.3318E09 3.7149E09 2.6908E04 6.6947E03 \nOBLGWO 2.2042E07 1.2605E07 1.6887E07 1.2778E07 9.1358E03 3.3451E03 \nACWOA 1.3860E08 6.2461E07 7.4290E09 3.9581E09 5.0191E04 9.0562E03 \nMOFOA 1.2354E09 7.4867E07 7.7038E10 2.4594E09 7.8687E04 3.7238E03 \nSCADE 4.5429E08 1.1842E08 3.0003E10 4.0210E09 5.6160E04 7.2834E03 \nFun F4 F5 F6 \nItem AVG STD AVG STD AVG STD \nSRXGWO 5.4006E02 3.2112E01 5.2075E02 7.2959E-02 6.1118E�02 2.5044E00 \nPSO 4.6707E02 3.2003E�01 5.2095E02 4.0216E-02 6.2317E02 3.2594E00 \nSCA 1.4150E03 2.7588E02 5.2093E02 6.2064E-02 6.3356E02 2.3449E00 \nMFO 1.5209E03 1.0125E03 5.2030E�02 1.6938E-01 6.2361E02 3.5309E00 \nWOA 5.9251E02 6.0017E01 5.2034E02 1.6112E-01 6.3494E02 3.5778E00 \nBA 4.2155E�02 3.2061E01 5.2095E02 6.4791E-02 6.3398E02 3.6948E00 \nFA 1.5337E03 1.5192E02 5.2096E02 4.5044E-02 6.3359E02 9.2350E-01 \nOBSCA 2.3121E03 7.5405E02 5.2095E02 5.7443E-02 6.3205E02 1.4049E00 \nm_SCA 8.0286E02 1.1489E02 5.2056E02 1.4351E-01 6.2212E02 2.8889E00 \nOBLGWO 5.4647E02 4.7860E01 5.2096E02 5.9910E-02 6.1916E02 3.3318E00 \nACWOA 1.1803E03 2.6266E02 5.2085E02 1.7768E-01 6.3363E02 2.7978E00 \nMOFOA 1.0092E04 6.9816E02 5.2106E02 3.7558E-02 6.4079E02 6.7902E-01 \nSCADE 2.2480E03 4.6553E02 5.2097E02 4.3335E-02 6.3428E02 2.4021E00 \nFun F7 F8 F9 \nItem AVG STD AVG STD AVG STD \nSRXGWO 7.0144E02 4.4844E-01 8.3494E�02 6.4659E�00 9.9741E�02 2.4837E01 \nPSO 7.0229E02 
1.4348E-01 9.7268E02 2.6092E01 1.1067E03 2.4938E01 \nSCA 8.4528E02 2.6369E01 1.0362E03 1.9353E01 1.1756E03 2.4065E01 \nMFO 7.9627E02 6.3419E01 9.4824E02 3.3320E01 1.1205E03 4.4316E01 \nWOA 7.0099E02 7.2969E-02 9.9955E02 4.1935E01 1.1246E03 5.0520E01 \nBA 7.0066E�02 1.6102E-01 1.0275E03 5.2626E01 1.1641E03 5.6092E01 \nFA 8.4000E02 1.0997E01 1.0240E03 1.2118E01 1.1595E03 1.3038E01 \nOBSCA 9.1758E02 4.4244E01 1.0576E03 1.8074E01 1.1960E03 1.9095E01 \nm_SCA 7.4867E02 2.2125E01 9.3470E02 2.3339E01 1.0491E03 1.9402E01 \nOBLGWO 7.0119E02 9.2779E-02 9.2058E02 3.4783E01 1.0637E03 2.9684E01 \nACWOA 7.3883E02 2.1566E01 9.8681E02 1.5413E01 1.1270E03 1.7226E01 \nMOFOA 1.4082E03 4.6569E01 1.1760E03 1.1881E01 1.2583E03 9.4200E�00 \nSCADE 9.1691E02 4.4469E01 1.0684E03 1.0564E01 1.2058E03 1.8217E01 \nFun F10 F11 F12 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.7815E�03 2.3016E�02 4.1565E�03 1.0677E03 1.2012E03 3.7913E-01 \nPSO 5.0248E03 5.6761E02 5.8289E03 4.4923E02 1.2023E03 3.0765E-01 \nSCA 7.0064E03 5.2529E02 8.0775E03 3.0696E02 1.2025E03 2.1633E-01 \nMFO 4.6021E03 8.7516E02 5.2295E03 7.7681E02 1.2004E�03 1.9653E-01 \nWOA 4.9691E03 7.4150E02 5.8744E03 9.0861E02 1.2017E03 4.7579E-01 \nBA 5.5034E03 5.6881E02 6.0313E03 6.9746E02 1.2011E03 3.5842E-01 \nFA 7.5532E03 3.1957E02 7.9058E03 2.9315E02 1.2026E03 2.3995E-01 \nOBSCA 6.3076E03 4.9831E02 7.3709E03 3.6056E02 1.2022E03 4.1510E-01 \nm_SCA 4.0584E03 7.1133E02 4.7823E03 6.5478E02 1.2008E03 3.3864E-01 \nOBLGWO 3.8703E03 8.9566E02 5.4446E03 1.0838E03 1.2023E03 5.7151E-01 \nACWOA 4.7309E03 7.3276E02 6.1655E03 9.3475E02 1.2018E03 4.7511E-01 \nMOFOA 9.2300E03 3.9968E02 9.0883E03 2.8283E�02 1.2029E03 2.7367E-01 \nSCADE 7.3914E03 2.4356E02 8.2418E03 2.8346E02 1.2026E03 2.4238E-01 \nFun F13 F14 F15 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.3004E�03 7.4709E-02 1.4005E03 2.8662E-01 1.5163E03 6.0201E00 \nPSO 1.3004E03 7.7571E-02 1.4003E03 1.2817E-01 1.5166E03 1.1804E�00 \nSCA 1.3030E03 2.6429E-01 1.4439E03 7.6871E00 5.5707E03 5.0710E03 \nMFO 
1.3020E03 1.3201E00 1.4347E03 2.4514E01 2.1529E05 5.9281E05 \nWOA 1.3006E03 1.4348E-01 1.4003E�03 4.2398E-02 1.5738E03 2.6213E01 \nBA 1.3005E03 1.5518E-01 1.4003E03 1.3344E-01 1.5296E03 6.4355E00 \nFA 1.3028E03 1.9987E-01 1.4404E03 4.2258E00 1.4383E04 5.6495E03 \n(continued on next page) X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n11Table 4 (continued ) \nFun F1 F2 F3 \nOBSCA 1.3037E03 3.6249E-01 1.4731E03 1.1450E01 1.7595E04 1.0828E04 \nm_SCA 1.3009E03 7.5448E-01 1.4172E03 7.0904E00 2.1370E03 8.9061E02 \nOBLGWO 1.3005E03 1.1306E-01 1.4004E03 1.7893E-01 1.5162E�03 4.9642E00 \nACWOA 1.3015E03 1.0565E00 1.4197E03 1.4944E01 2.0795E03 6.3700E02 \nMOFOA 1.3081E03 3.0417E-01 1.6411E03 9.7254E00 2.2096E05 3.2757E04 \nSCADE 1.3040E03 3.7540E-01 1.4874E03 8.7317E00 1.9117E04 6.0793E03 \nFun F16 F17 F18 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.6110E�03 4.7850E-01 6.0312E05 7.0192E05 1.0517E04 9.4630E03 \nPSO 1.6120E03 5.3328E-01 2.9096E05 1.3413E05 1.9795E06 5.9660E05 \nSCA 1.6127E03 2.3363E-01 6.2791E06 3.3409E06 1.4952E08 6.6811E07 \nMFO 1.6128E03 4.8942E-01 3.9449E06 7.4952E06 1.2984E08 4.9905E08 \nWOA 1.6124E03 4.0816E-01 4.2933E06 3.4224E06 7.9323E�03 5.7540E�03 \nBA 1.6133E03 3.0344E-01 1.0170E�05 9.2079E�04 9.5662E04 4.7410E04 \nFA 1.6129E03 2.1659E-01 6.7984E06 1.7537E06 3.0346E08 8.6628E07 \nOBSCA 1.6130E03 2.5196E-01 9.4571E06 3.4434E06 1.8689E08 1.1951E08 \nm_SCA 1.6114E03 7.5109E-01 1.7735E06 1.3758E06 2.3094E07 3.3825E07 \nOBLGWO 1.6120E03 4.3556E-01 1.2085E06 9.3079E05 3.4535E04 3.1262E04 \nACWOA 1.6122E03 4.8843E-01 1.5272E07 1.2808E07 5.6959E07 4.4496E07 \nMOFOA 1.6134E03 2.3187E-01 8.7256E07 2.6488E07 5.7808E09 1.0374E09 \nSCADE 1.6127E03 2.0380E-01 1.5384E07 5.7531E06 1.6460E08 8.4537E07 \nFun F19 F20 F21 \nItem AVG STD AVG STD AVG STD \nSRXGWO 1.9173E03 1.3658E01 2.9079E03 1.1572E03 3.7294E05 3.3804E05 \nPSO 1.9172E03 1.9835E�00 2.2959E�03 6.5024E�01 1.1324E05 6.7643E04 \nSCA 1.9893E03 2.5079E01 1.5103E04 3.7270E03 1.4348E06 
6.6889E05 \nMFO 1.9738E03 5.5538E01 5.2933E04 4.0442E04 1.0824E06 2.5030E06 \nWOA 1.9384E03 2.7102E01 3.2328E04 2.0050E04 1.1294E06 1.7119E06 \nBA 1.9335E03 3.4019E01 2.4023E03 1.1992E02 6.4514E�04 3.2131E�04 \nFA 2.0050E03 1.2211E01 1.8924E04 7.2016E03 1.6271E06 7.2788E05 \nOBSCA 2.0080E03 9.9922E00 2.9021E04 1.1924E04 1.8445E06 8.5563E05 \nm_SCA 1.9502E03 2.9621E01 1.0791E04 4.6890E03 3.7774E05 5.2345E05 \nOBLGWO 1.9170E�03 1.6997E01 5.6962E03 2.3328E03 5.2217E05 3.6592E05 \nACWOA 2.0080E03 2.5161E01 3.9788E04 1.9571E04 6.9036E06 5.4050E06 \nMOFOA 2.2412E03 1.8281E01 1.4788E05 5.3365E04 3.9619E07 1.4979E07 \nSCADE 2.0087E03 1.1766E01 2.8049E04 9.6164E03 2.3498E06 1.0171E06 \nFun F22 F23 F24 \nItem AVG STD AVG STD AVG STD \nSRXGWO 2.6550E03 1.8361E02 2.5000E�03 0.0000E�00 2.6000E�03 0.0000E�00 \nPSO 2.9439E03 1.8435E02 2.6161E03 5.8346E-01 2.6261E03 5.6750E00 \nSCA 2.9644E03 1.3112E02 2.6653E03 1.3746E01 2.6001E03 6.9049E-02 \nMFO 3.0695E03 2.1885E02 2.6708E03 3.4130E01 2.6722E03 2.7522E01 \nWOA 3.0538E03 2.9728E02 2.6334E03 1.0652E01 2.6118E03 3.7279E01 \nBA 3.3420E03 4.1760E02 2.6152E03 3.0962E-03 2.6654E03 2.6008E01 \nFA 3.0002E03 1.1217E�02 2.7329E03 1.7512E01 2.7050E03 4.5757E00 \nOBSCA 3.1226E03 1.6474E02 2.6858E03 1.7839E01 2.6000E03 3.0468E-04 \nm_SCA 2.6046E�03 2.1219E02 2.6370E03 6.7666E00 2.6000E03 6.8563E-04 \nOBLGWO 2.7106E03 1.7350E02 2.6181E03 1.4048E00 2.6009E03 5.0249E00 \nACWOA 3.1046E03 2.2793E02 2.5122E03 4.6578E01 2.6000E03 5.0998E-06 \nMOFOA 1.8112E04 1.1960E04 2.5000E03 0.0000E00 2.6000E03 0.0000E00 \nSCADE 3.1435E03 1.3870E02 2.5000E03 0.0000E00 2.6000E03 1.9769E-07 \nFun F25 F26 F27 \nItem AVG STD AVG STD AVG STD \nSRXGWO 2.7000E�03 0.0000E�00 2.7004E�03 8.2706E-02 2.9000E�03 0.0000E�00 \nPSO 2.7118E03 7.4419E00 2.7871E03 3.4604E01 3.4367E03 2.8726E02 \nSCA 2.7269E03 8.2372E00 2.7023E03 6.7894E-01 3.4443E03 3.2075E02 \nMFO 2.7194E03 1.1345E01 2.7024E03 1.2575E00 3.6640E03 1.4731E02 \nWOA 2.7153E03 1.6594E01 2.7005E03 1.3903E-01 3.8579E03 
we have enlarged the key parts.
Then, it can be seen by the magnified image that both \nSRXGWO and GWO find the near-optimal solution at the iteration \nnumber of 2, but it is evident that SRXGWO has a better fitness value for \nthe near-optimal solution. Finally, during the iterations, SRXGWO also \nkeeps searching for the optimal solution, and the fitness value of \nSRXGWO is optimized from 0.0003285 at the beginning to 0.0003271. \nThe fitness value of GWO does not change significantly, and the algo-\nrithm falls into a local optimum. Therefore, it can be said that SRXGWO \ncan improve SVR’s prediction performance more effectively than GWO. \nThis work compares SRXGWO-SVR with well-known classification \nprediction models including GWO-SVR, SVR, BP, RF, KELM, RBF, and Table 4 (continued ) \nFun F1 F2 F3 \nSRXGWO 3.0000E�03 0.0000E�00 3.1000E�03 0.0000E�00 3.2000E�03 0.0000E�00 \nPSO 6.8849E03 8.7157E02 7.4382E04 1.3763E05 1.1678E04 6.2526E03 \nSCA 4.7736E03 2.6752E02 1.2836E07 7.6163E06 2.3980E05 7.9328E04 \nMFO 3.9703E03 2.4525E02 3.6610E06 3.9023E06 6.3694E04 5.2942E04 \nWOA 5.0223E03 6.7902E02 6.3246E06 4.5803E06 7.5080E04 4.8586E04 \nBA 5.1296E03 5.6070E02 3.6448E07 2.6098E07 1.3731E04 1.2024E04 \nFA 4.2282E03 1.4435E02 3.1490E06 8.4923E05 1.7420E05 3.9597E04 \nOBSCA 5.3567E03 2.9466E02 2.0712E07 9.7835E06 3.7443E05 1.9299E05 \nm_SCA 3.8890E03 1.2875E02 1.9729E06 4.4218E06 5.5540E04 2.8810E04 \nOBLGWO 3.4266E03 5.0458E02 4.9452E06 4.3781E06 1.9074E04 1.4566E04 \nACWOA 4.3232E03 1.2224E03 1.8950E07 1.5200E07 3.7383E05 2.2958E05 \nMOFOA 3.0000E03 0.0000E00 3.1000E03 0.0000E00 3.2000E03 0.0000E00 \nSCADE 4.9933E03 8.5262E02 1.5512E07 9.5368E06 4.8922E05 1.6393E05 \nTable 5 \nWilcoxon signed-rank test results of SRXGWO versus other peers. 
\nAlgorithm /\u0000/ Mean Rank \nSRXGWO ~ 2.13 1 \nPSO 19/8/3 4.80 4 \nSCA 30/0/0 8.57 9 \nMFO 26/2/2 7.33 7 \nWOA 25/4/1 6.13 6 \nBA 20/7/3 5.93 5 \nFA 30/0/0 9.47 10 \nOBSCA 29/0/1 9.70 11 \nm_SCA 26/2/2 4.73 3 \nOBLGWO 20/2/8 4.00 2 \nACWOA 28/0/2 7.57 8 \nMOFOA 23/0/7 10.17 13 \nSCADE 27/0/3 9.87 12 \nFig. 3.Friedman test results of SRXGWO versus other peers. X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n13\nFig. 4.Convergence curves of SRXGWO and peer algorithms. \nFig. 5.240-day folding graph of the number of actual hospital visits. X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n14CNN to further highlight the benefits of SRXGWO-SVR. It uses R2, RMSE, \nand MAE to assess the accuracy of the predictions. In order to guarantee \nthe stability of the prediction results and prevent chance mistakes, the \n10-fold cross-validation is also utilised in the model training process. \nTable 6 displays the evaluation findings for each model, and it is clear \nthat SRXGWO-SVR performs the best in terms of R2, RMSE, and MAE \nassessment indices. The correlation coefficient, R2, is 0.99879, which \nshows that there is a strong connection between the prediction results of \nthe SRXGWO-SVR model and the actual value. It is clear that SRXGWO- \nSVR performs best in R2, RMSE, and MAE evaluation indices. RMSE and \nMAE are used to evaluate errors. The two forms of SVR errors are the \nleast, with corresponding values of 159.5753 and 100.0009. Following \nline graph analysis, iterative graph analysis, and evaluation result \nanalysis, it can be shown that the SRXGWO-SVR model has a very high \nprediction accuracy and also has more advantages than other \nFig. 6.Prediction results of SRXGWO-SVR. \nFig. 7.Iteration curves of SRXGWO and GWO when optimizing SVR. \nTable 6 \nEvaluation results of each prediction model. 
\nModel R2 RMSE MAE \nSRXGWO-SVR 0.99879 159.5753 100.0009 \nGWO-SVR 0.99869 159.5886 100.0069 \nSVR 0.99861 166.1568 105.0999 \nBP 0.99820 584.2596 119.5581 \nRF 0.98379 176.6171 335.1838 \nKELM 0.99819 195.6333 144.1484 \nRBF 0.99865 168.8734 110.3226 \nCNN 0.99744 228.9898 110.3226 X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n15algorithms. \n2. Prediction experiments on the test set \nThe model trained by the real sequence must be closer to the training \nset itself, and there may be problems of false accuracy of the prediction \nresults and overfitting of the prediction model. Moreover, the prediction \nproblem, in reality, will not be the same as the real sequence of the \ntraining set, so it is necessary to simulate and test the completed trained \nmodel by the test set. \nFig. 8 shows the prediction fold of SRXGWO-SVR for the test set. \nAgain, the Original fold represents the data distribution of the test set, \nand the Predicted fold represents the prediction results given by the \nSRXGWO-SVR model. It can be seen that SRXGWO-SVR also predicts \nvery well in the test set prediction with high correlation. However, the \ndeviation of SRXGWO-SVR in predicting the test set is more significant \nthan the training set, e.g., the deviation of the dashboard on days \n7,10,13,36 is larger. Therefore, overall, SRXGWO-SVR still has a highly \naccurate prediction performance and does not fall into the overfitting \nproblem when faced with brand-new patient-flow data. However, it \ncannot achieve the results in training. \nTo further explore the performance of SRXGWO-SVR in the face of \nnew sample sequences and to show the advantages of SRXGWO-SVR \nover other algorithms, the test set experiments also compare \nSRXGWO-SVR with well-known classification prediction models such as \nGWO-SVR, SVR, and BP, and evaluate the prediction results using R2, \nRMSE, and MAE. The evaluation results of each model are shown in \nTable 7. 
It can be seen that SRXGWO-SVR has higher Spearman corre -\nlation and lower error in RMSE, MAE for prediction results compared \nwith GWO-SVR, SVR, which indicates that SRXGWO-SVR still has an \nadvantage over the unimproved GWO-SVR and SVR in the face of new \ndata sets. In addition, it can be seen that SRXGWO-SVR still has a greater \nadvantage over BP, RF, KELM, RBF, and CNN classical models, and \nperforms better in terms of R2, RMSE, and MAE. \nFinally, this paper combines the prediction results of the training set \nand the test set for statistical comparisons in order to further highlight \nthe significance of the training set experiments and the test set experi -\nments, as well as to demonstrate the prediction effectiveness of \nSRXGWO-SVR for various data sets and the advantages of SRXGWO-SVR \nover other algorithms. The comparison findings are shown in Figs. 9–11, \nwhere the horizontal axis represents each comparison model and the \nvertical axis the assessment standards. Fig. 9 shows that when SRXGWO- SVR is moved from the training set to the test set, the prediction rele-\nvance of the model diminishes and that KELM fluctuates the least. \nHowever, SRXGWO-SVR still outperforms KELM in terms of accuracy, \nsuggesting that it may continue to hold the top spot in future patient- \nflow prediction. The assessment findings were normalized in this \nresearch and then shown once more since RMSE and MAE are prediction \nerrors and the difference between the data is too great. Figs. 10 and 11 \nshow intuitively how much more accurate SRXGWO-SVR is than other \nmodels like BP, RF, CNN, and others. Additionally, even after switching \ndatasets, there is little error variation in the SRXGWO-SVR prediction \nresults, demonstrating the model ’s great stability. It can be shown that \nSRXGWO-SVR is a very accurate, highly generalizable, and highly stable \nprediction model based on the experimental findings of the training and \ntest sets. 
\n5.Conclusions and future works \nThis paper proposes a high-performance optimization algorithm \nSRXGWO and an effective patient-flow prediction model SRXGWO-SVR, \naiming to predict patients ’ medical needs and achieve orderly patient \naccess by analyzing the changing dynamics and objective laws of \nPatient-flow. First, this paper introduces the current research status of \nartificial intelligence technology for predicting patient-flow and finds \nthat the existing prediction models are not strong in prediction accuracy \nand generalization. Therefore, to improve the accuracy and general -\nization of the prediction model, SRXGWO is proposed based on three \nimprovement strategies and GWO, in which the Sobol sequence im-\nproves the solution space coverage of population initialization, Cauchy \nrandom replacement strategy enhances the information exchange be-\ntween individuals, directional mutation mechanism improves the search \nFig. 8.SRXGWO-SVR predictions for the test set. Table 7 \nEvaluation results of each model based on the test set. \nModel R2 RMSE MAE \nSRXGWO-SVR 0.99835 199.0553 125.6847 \nGWO-SVR 0.99802 199.0954 125.7070 \nSVM 0.99783 218.1971 136.1934 \nBP 0.99738 232.2147 150.2261 \nRF 0.97952 701.2146 427.7865 \nKELM 0.99819 291.1310 185.8860 \nRBF 0.99831 201.5883 129.3960 \nCNN 0.98132 628.8679 363.9654 X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n16ability of the algorithm and the ability to jump out of the local optimum. \nThen, the SRXGWO-SVR prediction model is proposed by combining the \nhigh-performance SRXGWO algorithm with the SVR prediction method \nto accurately predict the number of patients and reasonably schedule \nmedical resources. In the experimental part, ablation experiments are \nfirst conducted to compare SRXGWO with GWO combined with different \nmechanisms. It is verified that SRXGWO, with three improved strategies, \nsimultaneously is the strongest performance. 
The authors declare that there is no conflict of interest regarding the publication of this article.
Hsiang, Social Network \nbased sensitivity analysis for patient flow using computer simulation, Comput. Ind. \nEng. 88 (2015) 264–272. \n[5]A.R. Sharafat, M. Bayati, PatientFlowNet: a deep learning approach to patient flow \nprediction in emergency departments, IEEE Access 9 (2021) 45552 –45561 . \n[6]M. Tavakoli, R. Tavakkoli-Moghaddam, R. Mesbahi, M. Ghanavati-Nejad, \nA. Tajally, Simulation of the COVID-19 patient flow and investigation of the future \npatient arrival using a time-series prediction model: a real-case study, Med. Biol. \nEng. Comput. 60 (2022) 969–990. \n[7]S. Mirjalili, S.M. Mirjalili, A. Lewis, Grey wolf optimizer, Adv. Eng. Software 69 \n(2014) 46–61. \n[8]X.-S. Yang, A new metaheuristic bat-inspired algorithm, in: J.R. Gonz ˘alez, D. \nA. Pelta, C. Cruz, G. Terrazas, N. Krasnogor (Eds.), Nature Inspired Cooperative \nStrategies for Optimization (NICSO 2010), Springer Berlin Heidelberg, Berlin, \nHeidelberg, 2010, pp. 65–74. \n[9]R. Storn, K.J.J.o.G.O. Price, Differential evolution – a simple and efficient heuristic \nfor global, Optimization over Continuous Spaces 11 (1997) 341–359. \n[10] S. Mirjalili, SCA, A Sine Cosine Algorithm for solving optimization problems, \nKnowl. Base Syst. 96 (2016) 120–133. \n[11] S. Mirjalili, A.H. Gandomi, S.Z. Mirjalili, S. Saremi, H. Faris, S.M. Mirjalili, Salp \nSwarm Algorithm: a bio-inspired optimizer for engineering design problems, Adv. \nEng. Software 114 (2017) 163–191. \n[12] S. Mirjalili, A. Lewis, The whale optimization algorithm, Adv. Eng. Software 95 \n(2016) 51–67. \n[13] S. Mirjalili, Moth-flame optimization algorithm: a novel nature-inspired heuristic \nparadigm, Knowl. Base Syst. 89 (2015) 228–249. \n[14] J. Kennedy, R. Eberhart, Particle swarm optimization, in: Proceedings of ICNN ’95 \nvol. 1944, International Conference on Neural Networks, 1995, pp. 1942 –1948 . \n[15] Y. Yang, H. Chen, A.A. Heidari, A.H. 
Gandomi, Hunger games search: visions, \nconception, implementation, deep analysis, perspectives, and towards performance \nshifts, Expert Syst. Appl. 177 (2021), 114864 . \n[16] A.A. Heidari, S. Mirjalili, H. Faris, I. Aljarah, M. Mafarja, H. Chen, Harris hawks \noptimization: algorithm and applications, Future Generation Computer Systems- \nthe International Journal of Escience 97 (2019) 849–872. \n[17] H. Su, D. Zhao, A. Asghar Heidari, L. Liu, X. Zhang, M. Mafarja, H. Chen, RIME: A \nPhysics-Based Optimization, Neurocomputing, 2023 . \n[18] J. Tu, H. Chen, M. Wang, A.H. Gandomi, The colony predation algorithm, Journal \nof Bionic Engineering 18 (2021) 674–710. \n[19] I. Ahmadianfar, A. Asghar Heidari, A.H. Gandomi, X. Chu, H. Chen, RUN beyond \nthe metaphor: an efficient optimization algorithm based on Runge Kutta method, \nExpert Syst. Appl. (2021), 115079 . \n[20] I. Ahmadianfar, A. Asghar Heidari, S. Noshadian, H. Chen, A.H. Gandomi, INFO: an \nefficient optimization algorithm based on weighted mean of vectors, Expert Syst. \nAppl. (2022), 116516 . \n[21] H. Chen, C. Li, M. Mafarja, A.A. Heidari, Y. Chen, Z. Cai, Slime mould algorithm: a \ncomprehensive review of recent variants and applications, Int. J. Syst. Sci. (2022) \n1–32. \n[22] S. Li, H. Chen, M. Wang, A.A. Heidari, S. Mirjalili, Slime mould algorithm: a new \nmethod for stochastic optimization, Future Generat. Comput. Syst. 111 (2020) \n300–323. \n[23] M. Abd Elaziz, D. Oliva, S. Xiong, An improved opposition-based sine cosine \nalgorithm for global optimization, Expert Syst. Appl. 90 (2017) 484–500. \n[24] C. Qu, Z. Zeng, J. Dai, Z. Yi, W. He, A modified sine-cosine algorithm based on \nneighborhood search and greedy Levy mutation, Comput. Intell. Neurosci. (2018), \n2018) 4231647-4231647 . \n[25] A.A. Heidari, R. Ali Abbaspour, H. Chen, Efficient boosted grey wolf optimizers for \nglobal search and kernel extreme learning machine training, Appl. Soft Comput. 81 \n(2019), 105521 . \n[26] M.A. 
Elhosseini, A.Y. Haikal, M. Badawy, N. Khashan, Biped robot stability based \non an A–C parametric Whale Optimization Algorithm, Journal of Computational \nScience 31 (2019) 17–32. \n[27] H. Chen, S. Li, A.A. Heidari, P. Wang, J. Li, Y. Yang, M. Wang, C. Huang, Efficient \nmulti-population outpost fruit fly-driven optimizers: framework and advances in \nsupport vector machines, Expert Syst. Appl. (2020) 142. \n[28] H. Nenavath, R.K. Jatoth, Hybridizing sine cosine algorithm with differential \nevolution for global optimization and object tracking, Appl. Soft Comput. 62 \n(2018) 1019 –1043 . \n[29] Y. Zhang, R. Liu, A.A. Heidari, X. Wang, Y. Chen, M. Wang, H. Chen, Towards \naugmented kernel extreme learning models for bankruptcy prediction: algorithmic \nbehavior and comprehensive analysis, Neurocomputing 430 (2021) 185–212. [30] Y. Liu, A.A. Heidari, Z. Cai, G. Liang, H. Chen, Z. Pan, A. Alsufyani, S. Bourouis, \nSimulated annealing-based dynamic step shuffled frog leaping algorithm: optimal \nperformance design and feature selection, Neurocomputing 503 (2022) 325–362. \n[31] Y. Xue, B. Xue, M. Zhang, Self-adaptive particle swarm optimization for large-scale \nfeature selection in classification, ACM Trans. Knowl. Discov. Data 13 (2019) 1–27. \n[32] Y. Xue, X. Cai, F. Neri, A multi-objective evolutionary algorithm with interval \nbased initialization and self-adaptive crossover operator for large-scale feature \nselection in classification, Appl. Soft Comput. 127 (2022), 109420 . \n[33] X. Wang, X. Dong, Y. Zhang, H. Chen, Crisscross Harris hawks optimizer for global \ntasks and feature selection, Journal of Bionic Engineering (2022) 1–22. \n[34] W. Shan, H. Hu, Z. Cai, H. Chen, H. Liu, M. Wang, Y. Teng, Multi-strategies boosted \nmutative crow search algorithm for global tasks: cases of continuous and discrete \noptimization, Journal of Bionic Engineering 19 (2022) 1830 –1849 . \n[35] R. Dong, H. Chen, A.A. Heidari, H. Turabieh, M. Mafarja, S. 
Wang, Boosted kernel \nsearch: framework, analysis and case studies on the economic emission dispatch \nproblem, Knowl. Base Syst. 233 (2021), 107529 . \n[36] C. Zhao, Y. Zhou, X. Lai, An integrated framework with evolutionary algorithm for \nmulti-scenario multi-objective optimization problems, Inf. Sci. 600 (2022) \n342–361. \n[37] W. Deng, J. Xu, X.Z. Gao, H. Zhao, An enhanced MSIQDE algorithm with novel \nmultiple strategies for global optimization problems, IEEE Transactions on \nSystems, Man, and Cybernetics: Systems 52 (2022) 1578 –1587 . \n[38] G. Sun, R. Han, L. Deng, C. Li, G. Yang, Hierarchical Structure-Based Joint \nOperations Algorithm for Global Optimization, Swarm and Evolutionary \nComputation, 2023, 101311 . \n[39] K. Yu, D. Zhang, J. Liang, K. Chen, C. Yue, K. Qiao, L. Wang, A correlation-guided \nlayered prediction approach for evolutionary dynamic multiobjective \noptimization, IEEE Trans. Evol. Comput. (2022), 1-1. \n[40] G. Sun, G. Yang, G. Zhang, Two-level parameter cooperation-based population \nregeneration framework for differential evolution, Swarm Evol. Comput. 75 \n(2022), 101122 . \n[41] C. Li, G. Sun, L. Deng, L. Qiao, G. Yang, A population state evaluation-based \nimprovement framework for differential evolution, Inf. Sci. 629 (2023) 15–38. \n[42] G. Sun, C. Li, L. Deng, An adaptive regeneration framework based on search space \nadjustment for differential evolution, Neural Comput. Appl. 33 (2021) 9503 –9519 . \n[43] X. Wen, K. Wang, H. Li, H. Sun, H. Wang, L. Jin, A two-stage solution method based \non NSGA-II for Green Multi-Objective integrated process planning and scheduling \nin a battery packaging machinery workshop, Swarm Evol. Comput. 61 (2021), \n100820 . \n[44] G. Wang, E. Fan, G. Zheng, K. Li, H. Huang, Research on Vessel Speed Heading and \nCollision Detection Method Based on AIS Data, Mobile Information Systems, 2022 . \n[45] Y. Xue, Y. Tong, F. 
Neri, An ensemble of differential evolution and Adam for \ntraining feed-forward neural networks, Inf. Sci. 608 (2022) 453–471. \n[46] J. Chen, Z. Cai, H. Chen, X. Chen, J. Escorcia-Gutierrez, R.F. Mansour, M. Ragab, \nRenal pathology images segmentation based on improved cuckoo search with \ndiffusion mechanism and adaptive beta-hill climbing, Journal of Bionic \nEngineering (2023) . \n[47] Y. Han, W. Chen, A.A. Heidari, H. Chen, Multi-verse optimizer with rosenbrock and \ndiffusion mechanisms for multilevel threshold image segmentation from COVID-19 \nchest X-ray images, Journal of Bionic Engineering 20 (2023) 1198 –1262 . \n[48] J. Xing, H. Zhao, H. Chen, R. Deng, L. Xiao, Boosting whale optimizer with quasi- \noppositional learning and Gaussian barebone for feature selection and COVID-19 \nimage segmentation, Journal of Bionic Engineering 20 (2023) 797–818. \n[49] H. Hu, W. Shan, J. Chen, L. Xing, A.A. Heidari, H. Chen, X. He, M. Wang, Dynamic \nindividual selection and crossover boosted forensic-based investigation algorithm \nfor global optimization and feature selection, Journal of Bionic Engineering \n(2023) . \n[50] X. Wang, X. Dong, Y. Zhang, H. Chen, Crisscross Harris hawks optimizer for global \ntasks and feature selection, Journal of Bionic Engineering 20 (2023) 1153 –1174 . \n[51] C. Lin, P. Wang, A.A. Heidari, X. Zhao, H. Chen, A boosted communicational salp \nswarm algorithm: performance optimization and comprehensive analysis, Journal \nof Bionic Engineering 20 (2023) 1296 –1332 . \n[52] C. Lin, P. Wang, X. Zhao, H. Chen, Double mutational salp swarm algorithm: from \noptimal performance design to analysis, Journal of Bionic Engineering 20 (2023) \n184–211. \n[53] J. Hu, S. Lv, T. Zhou, H. Chen, L. Xiao, X. Huang, L. Wang, P. Wu, Identification of \npulmonary hypertension animal models using a new evolutionary machine \nlearning framework based on blood routine indicators, Journal of Bionic \nEngineering 20 (2023) 762–781. \n[54] J. Liang, K. 
Qiao, K. Yu, B. Qu, C. Yue, W. Guo, L. Wang, Utilizing the relationship \nbetween unconstrained and constrained pareto fronts for constrained \nmultiobjective optimization, IEEE Trans. Cybern. (2022) 1–14. \n[55] C. Huang, X. Zhou, X. Ran, Y. Liu, W. Deng, W. Deng, Co-evolutionary competitive \nswarm optimizer with three-phase for large-scale complex optimization problem, \nInf. Sci. 619 (2023) 2–18. \n[56] J.S. Chou, J.P.P. Thedja, Metaheuristic optimization within machine learning- \nbased classification system for early warnings related to geotechnical problems, \nAutom. ConStruct. 68 (2016) 65–80. \n[57] A. Kaushik, N. Singal, A hybrid model of wavelet neural network and metaheuristic \nalgorithm for software development effort estimation, Int. J. Inf. Technol. 14 \n(2022) 1689 –1698 . \n[58] M. Mehraein, A. Mohanavelu, S.R. Naganna, C. Kulls, O. Kisi, Monthly Streamflow \nPrediction by Metaheuristic Regression Approaches Considering Satellite \nPrecipitation Data, vol. 14, Water, 2022 . X. Zhang et al. \nComputers in Biology and Medicine 163 (2023) 107166\n18[59] K. Zhu, S. Ying, N.N. Zhang, D.D. Zhu, Software defect prediction based on \nenhanced metaheuristic feature selection optimization and a hybrid deep neural \nnetwork, J. Syst. Software 180 (2021) . \n[60] J.S. Chou, K.H. Yang, J.P. Pampang, P. Anh-Duc, Evolutionary metaheuristic \nintelligence to simulate tensile loads in reinforcement for geosynthetic-reinforced \nsoil structures, Comput. Geotech. 66 (2015) 1–15. \n[61] J.W. Ma, D. Xia, H.X. Guo, Y.K. Wang, X.X. Niu, Z.Y. Liu, S. Jiang, Metaheuristic- \nbased support vector regression for landslide displacement prediction: a \ncomparative study, Landslides 19 (2022) 2489 –2511 . \n[62] N.D. Hoang, D.T. Bui, L. Kuo-Wei, Groutability estimation of grouting processes \nwith cement grouts using differential flower pollination optimized support vector \nmachine, Appl. Soft Comput. 45 (2016) 173–186. \n[63] S. García, A. Fern˘andez, J. Luengo, F. 
Herrera, Advanced nonparametric tests for \nmultiple comparisons in the design of experiments in computational intelligence \nand data mining: experimental analysis of power, Inf. Sci. 180 (2010) 2044 –2064 . \n[64] J. Derrac, S. García, D. Molina, F. Herrera, A practical tutorial on the use of \nnonparametric statistical tests as a methodology for comparing evolutionary and \nswarm intelligence algorithms, Swarm Evol. Comput. 1 (2011) 3–18. \n[65] C. Zhao, H. Wang, H. Chen, W. Shi, Y. Feng, JAMSNet: a remote pulse extraction \nnetwork based on joint attention and multi-scale fusion, IEEE Trans. Circ. Syst. \nVideo Technol. (2022), 1-1. [66] J. Lv, G. Li, X. Tong, W. Chen, J. Huang, C. Wang, G. Yang, Transfer learning \nenhanced generative adversarial networks for multi-channel MRI reconstruction, \nComput. Biol. Med. 134 (2021), 104504 . \n[67] X. Xue, G. Li, D. Zhou, Y. Zhang, L. Zhang, Y. Zhao, Z. Feng, L. Cui, Z. Zhou, X. Sun, \nResearch roadmap of service ecosystems: a crowd intelligence perspective, \nInternational Journal of Crowd Science 6 (2022) 195–222. \n[68] X. Xue, X.-N. Yu, D.-Y. Zhou, X. Wang, Z.-B. Zhou, F.-Y. Wang, Computational \nExperiments: Past, Present and Future, 2022 arXiv preprint arXiv:2202.13690 . \n[69] X. Xue, X. Yu, D. Zhou, C. Peng, X. Wang, D. Liu, F.-Y. Wang, Computational \nexperiments for complex social systems —Part III: the docking of domain models, \nIEEE Transactions on Computational Social Systems (2023) . \n[70] X. Cao, T. Cao, Z. Xu, B. Zeng, F. Gao, X. Guan, Resilience constrained scheduling of \nmobile emergency resources in electricity-hydrogen distribution network, IEEE \nTrans. Sustain. Energy (2022) 1–15. \n[71] Y. Dai, J. Wu, Y. Fan, J. Wang, J. Niu, F. Gu, S. Shen, MSEva: a musculoskeletal \nrehabilitation evaluation system based on EMG signals, ACM Trans. Sens. Netw. 19 \n(2022) 1–23. \n[72] J. Zhou, X. Zhang, Z. 
Jiang, Recognition of imbalanced epileptic EEG signals by a \ngraph-based extreme learning machine, Wireless Commun. Mobile Comput. 2021 \n(2021), 5871684 . X. Zhang et al. ",
"metadata": {
"filename": "An enhanced grey wolf optimizer boosted.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\An enhanced grey wolf optimizer boosted.pdf",
- "file_size": 8733829,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:34.531945",
- "content_length": 89059
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\An enhanced grey wolf optimizer boosted.pdf",
+ "size": 8733829,
+ "source": "docs_to_import"
+ },
+ "id": "f81bcb0f-9019-422d-8eb6-9215a5ab70ba"
},
- "d27acfd1-c16a-4c63-8631-72a61182680e": {
- "id": "d27acfd1-c16a-4c63-8631-72a61182680e",
- "content": "[Página 1]\nActa Astronautica 192 (2022) 276–290\nAvailable online 28 December 2021\n0094-5765/© 2021 IAA. Published by Elsevier Ltd. All rights reserved.An industry 4.0 approach to large scale production of satellite \nconstellations. The case study of composite sandwich panel manufacturing \nM. Eugenia,*, T. Querciaa, M. Bernabeia, A. Boschettoa, F. Costantinoa, L. Lampania, \nA. Marchetti Spaccamelab, A. Lombardob, M. Mecellab, L. Querzonib, R. Usingerc, \nM. Aliprandic, A. Stancuc, M.M. Ivagnesd, G. Morabitod, A. Simonid, A. Brand ~aoe, P. Gaudenzia \naDepartment of Mechanical and Aerospace Engineering, University of Rome “La Sapienza ”, Via Eudossiana 18, Rome, 00184, Italy \nbDepartment of Computer, Control, and Management Engineering Antonio Ruberti, University of Rome “La Sapienza ”, Via Ariosto 25, Rome, 00185, Italy \ncRUAG Schweiz AG, RUAG Space, Schaffhauserstrasse 580, 8052, Zürich, Switzerland \ndThales Alenia Space Italy, Via Saccomuro, 24, Rome, 00131, Italy \neEuropean Space Agency, ESTEC: European Space Research and Technology Centre Keplerlaan 1, 2201, AZ Noordwijk, Netherlands \nARTICLE INFO \nKeywords: \nIndustry 4.0 \nSpace 4.0 \nSmart manufacturing \nCyber-physical systems \nInternet of things \nDigital twin \nArtificial intelligence \nSpace Systems MAIT \nMega constellations ABSTRACT \nIn recent years the so-called New Space Economy or Space 4.0 paradigm has seen a number of new commercial \nplayers entering the satellite industry and creating completely new business models, most of which based on very \nlarge constellations consisting of several hundreds or even thousands of satellites. The production of the high \nnumber of satellites involved in modern mega-constellations is bringing in the space industry the necessity of \nimproved and optimized manufacturing approaches suitable for serial production , i.e., standard environment/ \nhigh number of platforms. 
In this framework, the adoption of Industry 4.0 methodologies within the space in-\ndustry will lead to a significant improvement and optimization of the whole Manufacturing Assembly Integration \nand Testing (MAIT) cycle. The main aim of Industry 4.0 is the creation of intelligent factories where \nmanufacturing technologies are upgraded and transformed by Cyber-Physical Systems (CPSs), the Internet of \nThings (IoT), Cloud Computing and Big Data Analytics with predictive monitoring features. Main element of the \nIndustry 4.0 approach is the synergic use of embedded sensing technologies in the frame of intelligent production \nprocesses, fostering a radical evolution of the industrial values chains, production value chains, and business \nmodels. In the present work, a possible application of the Industry 4.0 concepts to space industry is presented and \ndiscussed in terms of applicability and obtainable advantages. As a case study, the composite sandwich panel \nmanufacturing line of RUAG Space is considered. Particular focus will be given to the development of a CPS, by \nestablishing a control network of sensors (e.g. temperature, location, load) over a targeted MAIT process. \n1.Introduction \nNowadays, the terms “Industry 4.0” and “Smart Manufacturing ” \nhave become extremely popular to address the so-called Fourth Indus -\ntrial Revolution (4IR) [1] where the evolution of connectivity and \ncomputational calculus permit to create a bridge between physical and \nvirtual worlds. This connection is represented by Cyber-Physical Sys-\ntems, which will be the core of the present study. The same revolution \napplies indeed to the space sector. In 2016, while the European indus -\ntrial context was rushing into the innovation of factories to take \nadvantage of this new concept, the space industry – namely, the Euro-\npean Space Agency as its main promoter - followed through and launched the so-called “New Space ” or “Space 4.0” era [2]. 
The main \nproblem these ambitious initiatives aim to solve can be synthesized as \nmanufacturing inefficiency in a globalized competitive environment, i.e. \nthe slow operational response to customers ’ complex demand driven by \nincreasing availability of open information. \nThe background scenario to take into consideration to understand \nthis revolution is the worldwide rise of ICTs as a disruptive force of \nchange in any context, even society itself [3]. The faster and easier \navailability of data, as much as the greater reachability of people and \nplaces all over the world both physically and remotely, kicked-off an \nunstoppable globalization driver, increasing competitiveness and \nunlocking new opportunities of sharing knowledge to advance research \nor make a profit. For this reason, the rising demand for greater \n*Corresponding author. \nE-mail address: marco.eugeni@uniroma1.it (M. Eugeni). \nContents lists available at ScienceDirect \nActa Astronautica \nu{�~zkw! s{yo|kr o>!ÐÐÐ1ow�o �to~1m{y2w{m k�o2km�kk��~{!\nhttps://doi.org/10.1016/j.actaastro.2021.12.039 \nReceived 25 November 2021; Accepted 23 December 2021\n\n[Página 2]\nActa Astronautica 192 (2022) 276–290\n277connectivity pushed new commercial players to risk large investments in \nthe space industry with completely new business models [4]. American \nventures and start-ups led the way and invented a new concept of \nexploitation of already in vogue small satellites, putting them in large \nconstellations and in LEO to give high-bandwidth, low latency internet \naccess to remote areas or to gather new data from more frequent or \nhigher quality observations. The market segment grew exponentially in \nthe last decade, with constellations of even thousands of satellites being \nalready in the launch phase. Nowadays 2500 satellites are actively \norbiting around Earth and are expected to be 50 k in ten years [4]. Fig. 
1 \nshows the present status of the largest constellations: Space X’s Starlink \nis the most ambitious one, followed by Amazon ’s Project Kuiper. Both \nprojects aim at operating more than 1000 satellites at a time. In the top \nten also Airbus One Web can be found, whose bankruptcy had worried \ninvestors and shareholders, up to its recent rescue by UK government to \nconvert it into a navigation system after the loss of participation in \nGalileo project because of Brexit [5]. Among the well-known English \ncompanies, SatRevolution is an example of a company mainly based in \nEurope. However, all these companies have made international coop-\neration with ventures, billionaires, or big space players to allow the \nrealization of their projects. A significant reduction of costs is therefore \nnecessary for the industry to take advantage of such a promising new \nmarket segment and open it to smaller businesses or more traditional \nmanufacturing players. Producing thousands of satellites of high quality \nand with tighter deadlines then becomes the top priority, thus requiring \nan innovative approach to manufacturing processes, which made Smart \nManufacturing and its related technologies the best available solutions. \nThe biggest challenges in trying to reduce costs while leveraging capa-\nbility are the following: \n≡the increased diversification of requirements asked by customers or \nusers. \n≡the short lead-time to market from product development to product \ndelivery, reduced by global competitiveness. ≡the higher quality assurance needed by more complex new tech-\nnologies [3]. \n≡products reliability, stability, and longevity [6]. \nThe main challenge related to cost reduction is linked with the \nlimited-in-time capital investments, especially concerning the cost of \nlaunch. Developments are being made to make smaller, more flexible \nlaunchers at better prices. 
Analytics, computing power and AI (Artificial \nIntelligence) algorithms can improve the operations management of \nlarge constellations, reducing response times and operating costs. The \ngoal is to reach the autonomous or semiautonomous spacecraft control \nand management [4]. However, in the space industry the scale of vol-\nume product does not allow for the introduction of automation [7] as \nmuch as it does in mass-market sectors, thus generating the need for \nalternative concepts of product and process optimization, relying more \non IT (Information Technologies) than OT (Operational Technologies) \nor, better, on the integration of both. Even when more easily applicable, \na new wave of automation would require the conversion of blue-collar \njobs to white-collar jobs, with a fast reskilling and new training of \nhuman resources toward greater horizontal connectivity and interop -\nerability [8]. In any case, the space industry needs to become “smarter ” \nand its smartness level will be measured by the degree of reflection of its \nproducts and processes in the new digital world, also called the “cy-\nberspace ”. Competition is not between products or processes anymore, \nbut rather between the information services and analytics algorithms \nbehind them. The solution proposed to convey “smartness ” to the real-\nization of large constellations of small satellites can be borrowed by the \nnewest frontiers of Smart Manufacturing, especially in the framework of \n“Industry 4.0” initiatives spread all over the world [9]. Thanks to the \nprinciples of Smart Manufacturing, it is possible to translate a conven -\ntional in-line dedicated manufacturing process into a fully integrated \ndigitalized process using the latest information technologies. 
The space \nindustry has not a long experience in serial process optimization, \ntherefore it must take advantage of the state of the art in other industries \nto win the challenges previously mentioned and meet the need for a Acronyms/abbreviations \n4IR Fourth Industrial Revolution \nAGV Automated Guidance Vehicle \nALM Application Lifecycle Management \nAM Additive Manufacturing \nAPM Automated Insert Potting Machine \nASIC Application Specific Integrated Circuit \nCPS Cyber-Physical Systems \nCPPS Cyber-Physical Production Systems \nDT Digital Twin \nERP Enterprise Resource Planning ICT Information and Communication Technology \nIoT Internet of Things \nKET Key Enabling Technology \nKPI Key Performance Indicator \nLEO Low Earth Orbit \nMAIT Manufacturing Assembly Integration and Testing \nMEMS Micro Electro-Mechanical Systems \nMES Manufacturing Executive System \nMOM Manufacturing Operations Management \nNDI Non-destructive Inspection technique \nOT Operational Technology \nUT Ultrasonic Testing \nFig. 1.Pareto chart of planned and launched small satellite constellations per number of satellites as of March 2020. [© newspace.im ]. M. Eugeni et al.\n\n[Página 3]\nActa Astronautica 192 (2022) 276–290\n278better capability over cost ratio. It will need to master the latest de-\nvelopments in other industries in the field of Smart Manufacturing and \ntake them to the next level for the first time. Opportunities for stan-\ndardization, modularization and serialization are evident, especially \nbenchmarking with the Do-It-Yourself philosophy that the Chinese \nAerospace industry is trying to pursue with micro and nanosatellites. For \nexample, from a strategic point of view, with respect to German-born \n“Industry 4.0”, Chinese plan (called “Made in China 2025”) has star-\nted with a pilot and then will be extended step by step [10,11]. 
Their \ngoal is to introduce a comprehensive innovation system on a small scale \nto increase the manufacturing capability index and thus the convenience \nof the product at macro level both for producers and customers [6]. \nBesides the fundamental trend of lowering costs, satellite \nmanufacturing moves toward the concept of universality, that is the \npossibility to leverage international competitiveness for supplies and \nraw materials to set factories for enhanced rapidity, easier maintenance, \nand better upgradability [6]. Following this reasoning, the UK space \nsector in 2019 explored the interesting concept of a “Global Production \nNetwork”, focused on dynamics and thus on the importance of heritage \nto manage associated risks. In particular, the UK sector is trying to \nmitigate risks by counting on well-proven technology and structuring \nsolid relationships with national and international regulators [12]. The \nBrexit and covid-19 pandemic, however, will put the success of this \nviewpoint in doubt for two reasons: first of all, after Brexit the UK will be \nless and less protected by the European Community, being more of a \ncompetitor in the European market, and thus incurring in higher taxa-\ntion for import/export, thus losing the advantage of lowering process \nand product costs [5]; secondly, having disrupted logistics, travel and \ntransportation, the covid-19 and its safety regulations will force Euro-\npean space companies to rapidly invest in new home-made technologies \nin order to keep their workforce and avoid tensions coming from the risk \nof increased unemployment and difficulty to expatriate. 
\nThe paper is organized as follows: Section 2 explains the theoretical \nbackground necessary to understand the use of cyber-physical systems in \na space factory; Section 3 introduces the approach to its implementation, \nreviewing hardware technology, software technology, sensors systems \nand Non Destructive Inspection (NDI) techniques; in Section 4 the \napproach is applied to the case study on the real process of RUAG’s \nsandwich composite panel manufacturing; eventually, Section 5 pre-\nsents the conclusions of the study. \n2.Theoretical background: cyber-physical systems \nIn this Section, an overview of Smart Manufacturing concepts, tools \nand strategies is presented together with the most adopted SM frame -\nwork, RAMI 4.0, are illustrated. Among SM concepts, the theory of \nCyber-Physical-Systems is highlighted as the foundation for the inte-\ngration of IT and OT enabling the improvement of an MAIT process in \nthe space industry. Concepts, characteristics, and contextualization in a \nproduction environment are given. \n2.1. Smart Manufacturing concepts, tools and strategies \nSmart Manufacturing focuses on establishing intelligent and \ncommunicative systems based on interoperability, i.e. machine-to- \nmachine and human-to-machine interconnections, dealing with a digi-\ntalized data flow from intelligent and distributed system interaction \n[13]. Products, machines, and company processes acquire a higher level \nof knowledge by data acquisition of parameters, e.g. product charac -\nteristics, localization, process parameters (temperature, pressure, speed, \netc.), and also information from the other stakeholders (e.g. customers, \nsuppliers). This data collection is transferred through internal or \nexternal communication networks, to be shared and to enable \nself-control capacities of products, machines, processes. 
Thus, these el-\nements become “smart”: capable to measure, recognize, communicate, \ncarry out decision-making processes (mostly without man intervention), to activate actions and operations in production [14]. Smart \nmanufacturing in short is “a data intensive application of information \ntechnology at the shop floor level and above to enable intelligent, effi-\ncient, and responsive operations” [15]. To consider a process “smart”, it \nis necessary to satisfy the following characteristics [16]: (I) computeri -\nzation, or the ability to control or monitor operations through pro-\ngrammable logics such as PLC, microcontroller, or microcomputer; (II) \nconnectivity, achieved through communication networks such as 4G, \n5G, Wi-Fi or specialized protocols; (III) visibility; (IV) transparency, \nbuilding an operating history and allowing problem solving based on \nreal data; (V) predictive capacity, adopting models based on algorithms \nthat correlate past operations with the measured real-time parameters; \n(VI) adaptability, allowing the system to adapt its operations. \nSmart Manufacturing strategic action lines are focused to reach im-\nprovements on autonomous interoperability, agility, flexibility, \ndecision-making, efficiency or cost reductions, mass customization, \nservitization [3,17–19]. It enables companies to cope with the chal-\nlenges of producing individualised products as expected by customers \nwith a short lead-time to market and at the cost of mass production [20]. \nSmart Manufacturing relies on the interdisciplinary and complex \nimplementation of several different technologies, such as \nCyber-Physical-Systems, Artificial Intelligence [21], Cloud Computing \n[22], Big Data analytics [23], Machine Learning [24], Internet of Things \n[25], Augmented Reality and Virtual Reality [26], etc. This paper will \nfocus on those selected to the implementation of a CPS architecture in a \ncomplex MAIT process in the space industry. 
However, a common \nstandard infrastructure is shared among all these technologies, helping \nto contextualize them in the overall product life-cycle value chain: the \nso-called RAMI 4.0 [27]. RAMI 4.0 ensures intercommunication and \nunderstanding across all business units and functions with a \nservice-oriented architecture, starting from physical things and arriving \nto the most digital business processes through a bi-dimensional hori-\nzontal and vertical expansion, following respectively the increase of \nvalue and the increase of authority, see Fig. 2. RAMI 4.0 well represents \nIndustry 4.0 concepts of holistic integration as well as easy interopera -\nbility, modularity and reconfigurability, bringing them directly in the \nstructure of the business, sometimes called enterprise, for its compre -\nhensive service-oriented goals. Being RAMI 4.0 such a complex archi -\ntecture, a hybrid model with the upper layers substituted with \ntraditional MES and/or ERP is under study to fasten its implementation \n[28]. Among all SM tools, the CPS has the best potential to reproduce \nthis framework, being the only one able to also integrate all other \ntechnologies. \n2.2. Cyber-physical systems applied to a manufacturing environment \nRecently, there has been an explosive growth in the development and \nimplementation of various Cyber-Physical Systems (CPS) [29]. CPS \n(cyber-physical systems) are physical systems that incorporate in-\ntegrations of computation, networking-communication, and physical \nprocesses control, see Fig. 3. They are made of heterogeneous cooper -\nating components interacting through a complex, coupled physical \nenvironment operating over many spatial and temporal scales [30]. \nEmbedded computers and networks monitor and control the physical \nprocesses, with feedback loops where physical processes affect compu -\ntations and vice-versa. 
CPS are defined as transformative technologies \nfor managing interconnected systems between their physical assets and \ncomputational capabilities [31]. CPS are systems of integrated compu -\ntational entities which are in intensive connection with the surrounding \nphysical world and its on-going processes, providing and using, at the \nsame time, data-accessing and data-processing services available on the \nInternet [32]. In other words, CPS can be generally characterized as \n‘‘physical and engineered systems whose operations are monitored, \ncontrolled, coordinated, and integrated by a computing and communi -\ncating core’’ [33]. To this end, CPSs are able to Ref. [34]: M. Eugeni et al.\n\n[Página 4]\nActa Astronautica 192 (2022) 276–290\n279(i) collect data referred to themselves and their environment \n(ii) process and evaluate these data \n(iii) connect and communicate with other systems \n(iv) initiate actions. \nA CPS is defined as a system in which physical objects are required to \nbe accompanied by their representation in the digital world, to be in-\ntegrated with elements with computing, storage, and communication \ncapabilities, and to be networked between them. They are considered \none of the key technological innovations (Key Enabling Technology - \nKET) of the Fourth Industrial Revolution, a transformative technology \nthat can be placed in the foreground for the potential promised for the \ncreation of value along with the three dimensions of the digitalization of \nmanufacturing: the smart product, Smart Manufacturing, and changes in \nthe business models of companies [35]. Smart manufacturing systems \nuse CPS predominantly as a tool to monitor the physical world and make \ndecentralized decisions in the virtual world, often referring to Cyber-Physical Production Systems (CPPS). The growing availability, \naffordability and adaptability of sensors and connection systems are \nincreasing the widespread adoption of CPS and CPPS. 
Production data \nare easier to be collected and transferred to cloud platforms, where \nanalytics and AI tools permit to analyse and predict the production be-\nhaviours, and consequently act (manually or automatically) to increase \nperformance. A complete CPS should be able to get information from the \nphysical world and act on it, usually after data computations suggested \nthe action to be implemented. CPS should not be confused with IoT, \nbecause IoT is part of a CPS system, that for example could also include \nAI technology. Some insights on these technologies can be found in \nRef. [36]. Fig. 4 shows how CPPS connect a system in the physical world \nand its Digital Twin (in the cyber world), with an important remark \nabout the human-centred vision of these systems. Indeed, the oper-\nator/manager is always needed to check the process reliability and often \nto validate the analysis and the actuating decisions. In the design of a \nCPS it is recommended by Ref. [30] to pay attention to issues of \nFig. 2.RAMI 4.0 architecture is the most common standard framework for the application of Smart Manufacturing to a whole enterprise value chain. The archi -\ntecture is structured on a bi-directional and multi-layer way, with developments going both horizontally, following product life cycle value (procurement to sales) \nand hierarchical levels of complexity (product to connected world) and vertically, expanding from the simple asset (e.g. shop floor equipment) to the entire busi-\nness [27]. \nFig. 3.The figure shows a layout of the Cyber- \nPhysical System of a sensorized MAIT process plant. \nIt illustrates the cycle from physical to cyber domains, \npassing by control, communication and computation \nfunctions. In the computational layer, data records \nand analysis are performed. 
The Digital Twin re-\nproduces the process plant in the Cyber Domain, \nwhile the Internet of Things allows its communication \nwith the physical domain through the interconnection \nof sensors in an online platform. Eventually, intelli -\ngent analytics can be performed by AI algorithms \nintroduced in the computation phase and aimed at \nimproving the data reports, allowing faster decision- \nmaking, possibly made autonomously or semi- \nautonomously by the process machines themselves. M. Eugeni et al.\n\n[Página 5]\nActa Astronautica 192 (2022) 276–290\n280reliability and security, level of abstraction and architecture styles for \nmodular design and development, new frameworks and algorithms, \nconcepts of dependability, reconfigurability, certifiability and trust-\nworthiness. More research on this topic can be found in Refs. [37,38]. \n3.Approach: implementing a CPS architecture in space \nmanufacturing \nThe main problem a space factory nowadays faces is related to the \nhigh costs of keeping the pace of a competitive technological market, \nleading companies worldwide with the help of new business models to \nlower entry barriers to the segment. Technological innovation inte-\ngrating the newest IT solutions is requested to traditional manufacturing \nshop floors to leverage space long-term heritage while keeping the \nbusiness sustainable. The CPS was chosen among all SM tools, according \nto the features described in the previous chapter, as the best candidate to \ngive a measurable and reliable improvement to a space manufacturing \nprocess. Introducing a CPS into a space manufacturing facility requires a \ntwo-level approach: \n1. Monitoring the product to be manufactured. \n2. Monitoring the production, integration and test means necessary to \ndeliver the product. 
\nTo fully realize this approach, three main areas of technical \ncompetence have been considered: \n● Hardware technology, to identify the critical operations of a complex \nMAIT process; \n● Software technology, to identify the most performant solutions to \ndigitalize the process; \n● Sensor systems and Non-Destructive Inspection (NDI) techniques, to \nidentify types of sensors and related techniques to enhance product \nand process control and monitoring. \n3.1. Hardware technology \nIn this paragraph an overview of the applications of SM tools from \nthe point of view of Hardware Technology is given. First, the illustration \nof typical production systems will explain the convergence toward the \ncellular system. Then, traditional production characteristics in the space \nindustry will be mentioned and their evolution following SM principles \nfrom the point of view of HW technology will be presented. \nConcerning production systems, the aerospace industry is mainly \ncharacterized by intermittent production and the management of the production is typically based on job-shop criteria [39]. This system type \nis characterized by low volume and high variety with relatively low \nproduction rate and high flexibility. It is also noteworthy that the \nplanning, routing, and scheduling function is typically done for each \npart independently. The efficiency of the machines is low and, to reduce \ncost, they are general purpose machines. The machines and the move -\nments are reduced, and few setup operations are required. On the con-\ntrary, in large-scale productions machines are dedicated, and processing \nparameters are optimized for few types of parts. A continuous flow must \nbe maintained. In this case, high costs and highly specialized machines \nare affordable thanks to the large production volume. This type of \nproduction system is referred to as ‘process-based ’. 
The addressing of \nresources is completely dedicated to the optimization of specific pro-\ncesses and the routing of the single part reflects the sequence of the \noperations over the selected machine. As a result, the movements are \nmany and the mean lead time is affected. Between these two extremes, a \n‘combination layout ’ is usually proposed in industrial manufacturing. It \nis the so-called ‘cellular production ’ that requires a systematic approach \nin the design methodology that incorporates all the previous benefits \nand can easily move between the extremes, see Fig. 5 [39]. The benefits \nof the cellular production system are widely accepted in industrial \nproduction for the so-called mass customization, but many items must \nbe considered in the space industry. It is particularly important to \nmaintain the quality assurance of the fabricated components and it is \ndifficult to allow the automation of labour-intensive operations and \ncombinations between process options. \nTraditionally, space production systems, besides being of “job-shop ” \ntype, were mainly designed for single units. In Ref. [24] the example of \nBoeing is presented: the focus was on single unit delivery models and \nunique parts were supplied by customized contracts with suppliers \ncoming only from the space industry, with prototypes being qualified on \ndemand. Other traditional features included: (I) most of the documen -\ntation produced and archived in paper; (II) a low presence of automation \nor robotization; (III) single shift/5 days schedule; (IV) long life-cycle \nproducts of typically 10 years; (V) siloed structures for the different \ndepartments; (VI) “push ” approach with large stock of finite product \n[24,40–42]. Most of these characteristics evolved in the framework of \nIndustry 4.0 and Space 4.0 initiatives. 
The following interesting SM \nconcepts have been applied to HW technology, specifically in the \ncontext of small satellites ’ constellations [43]: \n● Automated Guidance Vehicles (AGVs) \nFig. 4.An example of the implementation of a cyber-physical system in the \nproduction department. The job flows from production orders to machines, \nwhile the decisions rise from machines back up to customer ’s orders. At every \nstage of data gathering and processing, human intervention is always necessary \nto provide advanced monitoring functions and interpreting results [91]. \nFig. 5.Types of production systems in terms of volume & variety and flexibility \n& efficiency. At the extremes, job-shop system qualifies as high variety and high \nflexibility and process-based system as high efficiency & high volume. The \nhybrid type cellular system lies in between. M. Eugeni et al.\n\n[Página 6]\nActa Astronautica 192 (2022) 276–290\n281Equipped with cameras and navigation software, these vehicles \nallow the transportation of heavy components or the final assembly \nthrough the factory. Well known in the automotive industry, this level of \nautomation was used by OneWeb facility in Florida. \n● Spring-based loading machines \nSpecific machines equipped with springs are used to load satellites to \navoid human non ergonomic operations. In general, flexibility of \norientation and vertical movement is required by satellite platforms to \nallow the last operations, when most subassemblies are completed and \nreaching parts is more difficult. \n● Additive Manufacturing (AM) \nAM is based on a layer-by-layer addition of material instead of \ntraditional machining ’s material-removing approach, thus allowing the \nrapid prototyping of even complex geometries thanks to advanced 3D \nsoftware design (for this reason, the technique is also called 3D printing \n[44]). 
A 3D printing machine was used by Telesat ’s facility in Ottawa, \nCanada, to realize the apertures of the phased-array antennas. This \nallowed the reduction of multiple part numbers into a single standard \none, besides a significant acceleration of times and reduction of costs. \nThe main limitation of the AM manufactured part is of comparable low \nstrength and associated quality, coupled with a high cost of the printing \nmachine system [45]. \n● Robots & cobots \nMultiple robotic solutions were applied for example by Telesat to \nmake repetitive and heavy operations easier, from manipulation of parts \nto cutting. However, these were used only to make prototypes, as the \nmass production is yet to come. The new frontier of robotization con-\ncerns “cobots ”: interconnected and easily programmable; autonomous, \nflexible, and collaborative; able to avoid collisions based on pre-set up \n360•visualizations of the environment; easily programmable [46]. An \nexample of learning cobots for painting can be found in Bombardier \n[47]. \n3.2. Software technology \nStarting from the traditional manufacturing data management sys-\ntems and passing by the concepts of interoperability, given by the In-\ndustrial Internet of Things (IIoT), and digitalization, given by the Digital \nTwin (DT), in this paragraph the most used CPS architecture will be \npresented. \nMany industries adopt the HMI-SCADA System, a comprehensive \nreal-time data control hardware and software architecture for \nManufacturing plants [48]. The Supervisory Control And Data Acquisi -\ntion system (SCADA) represents the overall control system, gathering \nand analysing data in real-time, while the Human Machine Interface \n(HMI) is the software showing data in a digestible format for humans \nthrough computing systems, allowing the interoperability of workers \nand machines. 
Interacting with equipment through user-friendly SW \ninterfaces, humans can reduce repetitive, unsafe, and heavy work or \nfacilitate their day-to-day process monitoring activities. The \nHMI-SCADA System architecture is based on executive functions and \ncommunicating functions. The executive functions are represented by \nthe field instrumentation (in-house instruments monitoring and con-\ntrolling automation processes) and Remote Terminal Units (RTU) or \nProgrammable Logic Controllers (PLC), whose concepts are mostly \noverlapped and represent the interface between plant equipment and \ntheir computing control units. The communication functions, on the \nother hand, are represented by a data communication layer, transferring \ndata from the plant to the server; a telemetry layer, transmitting and \nreceiving data from external sources (e.g. Earth telecommunication stations or satellite ground stations); the SCADA host or supervisory \nsystem, including the HMI software, representing the data receiving \nserver. In Fig. 6 the system is vertically contextualized as a level of the \noverall complex enterprise system standardized by the ISA95 model \n[49], including the device level at the bottom and the management \n(MOM or MES) and enterprise (ERP) interfaces at the top, the last two \nrepresenting the data analytics and integration platforms. Enterprise \nsystems (ES) or enterprise information systems (EIS) concepts have been \nresearched and utilized for decades, with applications in the aerospace \nindustry being studied at [50]. In some industries, like the pharmaceu -\ntical one, the application of this standard from day one allowed the \ntransition to paperless processes [51]. \nOver the next ten years, the number of connected devices will exceed \nthe number of inhabitants of the world [33]. The IIoT represents a \npossible evolution or integration of the HMI-SCADA System in the new \nindustrial landscape. 
The IIoT is defined as a network of physical sys-\ntems that can interact with each other thanks to standard communica -\ntion protocols, to achieve a common goal. Physical systems, and \ntherefore ’things ’, are represented by sensors, actuators, communication \nmodules and devices that can collaborate with each other, through \nintelligent components and applied software, and therefore achieve \nobjectives that strongly depend on their ability to transmit and process \ninformation. It is a multi-directional communication between processes, \nincluding the machinery used, the components and the products. The \nmain form of communication allowed by IIoT technology with respect to \nSCADA/HMI is machine-to-machine communication: the devices \ncommunicate directly using programmable electronic devices and \nwireless technologies. This form of interoperability among machines \ncould extensively contribute to the implementation of a CPS architec -\nture. Other recommendable IIoT characteristics are self-optimization, \nself-healing, self-configuration, and self-protection [52]. A use-case of \nIIoT-based architecture applied in aerospace manufacturing can be \nfound in Ref. [53]. \nTo implement a CPS, process physical entities must also have a \nfaithful representation in the digital world. This representation is \ndefined as ‘digital twin ’ (DT). DTs are commonly known as a key enabler \nfor the digital transformation in manufacturing. 
Different definitions \nagree on features such as (i) connectivity, i.e., the ability to communi -\ncate with other entities, (ii) autonomy, i.e., the possibility to live inde-\npendently from other entities, (iii) homogeneity, i.e., the capability, \nstrictly connected to autonomy, that allows using the same DT regard -\nless of the specific production environment, (iv) easiness of custom -\nization, i.e., the possibility to modify the behaviour of a physical entity \nby using the functionalities exposed by its DT, and (v) traceability, i.e., \nthe ability to trace the activity of the corresponding physical entity. To \nallow traceability, systems based on barcodes, QR codes or RFIDs [54] \nare applied or incorporated in the product. Finally, DTs monitor and \ncontrol the physical entities, where physical entities send data to update \nwhat are commonly referred to as the virtual models [55,56]. Many are \nthe advantages of this concept. First, it is easily useable for small series \nof customized products. Secondly, the DT allows modular simulation: \nbeing able to reproduce the operating system, it allows to modify \nproducts in a flexible way and to speed up innovation processes. The \npossibility of minimizing the time between design and product delivery \nthrough a DT is a good alternative not to change the process itself, which \nis often more complicated and more expensive. What a DT facilitates \nthat other technologies are not able to is the real time reproduction of \nthe system. Real time is a key concept in process monitoring, as the \nevolution of industrial trends follows speed, with dynamic systems \nhandling high volumes of data [57], also thanks to the introduction of \nnew semiconductor materials which can fasten electronical connections \nof process equipment and information systems. 
A challenge to consider, \nespecially when scaling the concept to a whole process, is the risk to \ndesign closed cycles, with monitoring functions heavy dependant on the \ndigital reproduction itself. Simulation models made of DTs are able to \nembrace the entire value chain and the entire life cycle of the products, M. Eugeni et al.\n\n[Página 7]\nActa Astronautica 192 (2022) 276–290\n282thus providing the necessary parameters not only to make fast and \nshort-term decisions, but also to allow more sustainable decisions in the \nlong-term, using the permanent collection of data through historical \nseries, which become rich material for statistical models to build more \naccurate correlation coefficients and to show more complete predictive \ngraphical instruments for trends ’ interpretation. \nThe integration of the SCADA/HMI level with the machine-to- \nmachine communication characteristic of the IIoT, linked to a 3D real- time throughout the process virtual representation of all sensors and \nmachinery using DTs, allows the implementation of a fully compre -\nhensive CPS architecture. The so-called CPS 5C level architecture [58] \nclearly defines, through sequential activity flows, the architecture of a \nCPS starting from the initial data acquisition, up to the creation of final \nvalue. The architecture is characterized by five levels: 1. Smart \nConnection Level: guarantees the timely and reliable acquisition of data \nfrom sensors, controllers or company production systems (e.g. ERP, \nFig. 6.Pyramidal architecture of an overall enterprise SM standards ’ system, showing the incorporation of HMI/SCADA level [53]. \nFig. 7.CPS 5-levels architecture is the most used. Levels of connection, conversion, cyber, cognition and configuration are shown. Related assets, users, and functions \nare displayed. [58]. M. Eugeni et al.\n\n[Página 8]\nActa Astronautica 192 (2022) 276–290\n283MOM, MES). 
It is central, considering the heterogeneity of the data, to \nselect appropriate data acquisition methods and sensors (in terms of \ntypes and specifications). 2. Data-to-Information Conversion Level: \nconverts the data collected into significant information through specific \nalgorithms and analysis. 3. Cyber-Level: acts as a central hub, where all \nthe information deriving from the various machines and components, \narrives and creates an intelligent network. They are then analysed to \nunderstand specific or collective information about the state of the \nsystem and evaluated to predict future events. 4. Cognition-Level: the \nimplementation of the CPS at this level generates an in-depth knowledge \nof the monitored system, a valuable support in the decision-making \nprocess. This knowledge allows operators to manage the system opti-\nmally. To ensure visibility, clarity, and immediacy in the understanding \nof the system by the operators, it is often necessary to implement graphic \nanalysis and representations. 5. Configuration-Level: the configuration \nlevel constitutes the feedback of the cyberspace in the physical space and \nacts as a supervisory control to make the machines self-configuring and \nself-adaptable. It acts as a resilience control system (RCS) and allows to \nmonitor, prevent, and correct the systems. This 5-level framework is by \nfar the main reference for CPS. Fig. 7 represents its levels and functions. \nFuture developments of CPS use the 5G protocol network, aiming to low \nlatency (ms) and high data rates (Gbps) [59]. With 6G, already started to \nbe studied, even Tbps could be reached [60]. However, it is estimated \nthat 6G will not be implemented until 2030 [61]. \n3.3. 
Sensor systems and non-destructive inspection techniques \nSensor systems and Non-Destructive Inspection techniques are \nreviewed as suitable to be integrated in the framework of a CPS to \nimprove the process by faster and more qualitative structural health \nmonitoring. Specific references to advantages in the space industry are \nmade. \n3.3.1. Fiber optics sensors \nThe principle of fiber optics sensors is that of an input light reflected \non a fiber and showing an interference pattern passing by a light de-\ntector. Fiber optics sensors can measure all traditional sensed parame -\nters in structural health monitoring (e.g. strain, temperature, crack \npropagation, leakage, corrosion). Fiber optic sensors have numerous \nadvantages for application in aerospace. In fact, they are lightweight, \ncan be easily embedded into composite structures and are immune to \nelectromagnetic interference. Furthermore, considering that a huge \nnumber of sensors will be necessary to completely cover the structural \nelements of a space structure, the multiplexing capability of optical fi-\nbers, that is the possibility of writing several sensors into one single \nfiber, results in a notable advantage both in terms of low complexity and \nlow weight. The fact that optical fibers do not involve any electric signal \nis a clear advantage from a safety point of view. Other advantages are \nthe long-term stability, low signal losses, the ability to operate in a wide \nrange of temperatures. Drawbacks using these sensors are the difficulty \nin replacing or repairing the fiber if it fails and some technological dif-\nficulties at cryogenic conditions such as low response time for hydrogen \nsensing and low sensitivity for temperature measurements. One of the \nmost interesting applications for small satellite manufacturing is that it \nis possible to monitor the degree of cure by simply measuring the \nrefractive index changes in isothermal conditions [62]. 
Optical fibers \ncan also be employed for chemical sensing during the cure of composite \nmaterials. At NASA-Langley chemical spectra were obtained using single \nmode optical fibers [63]. \n3.3.2. Acoustic emission sensors \nAcoustic emission (AE) sensors resort to the analysis of emissions \nfrom active defects and are sensitive to defect activity when a structure \nis loaded either during service or a proof test. AE analysis is a useful \nmethod for the investigation of local damage in materials. It is also \npossible to observe damage processes during the entire load history without any disturbance to the specimen. Acoustic emission sensors are \nused for monitoring a wide number of defects in materials such as dy-\nnamic strain, crack growth, leakage, corrosion, delamination, fiber \nbreakage. They are particularly suitable for monitoring the material \nfatigue behaviour since dynamic strain is measured. Conventional \ntechnologies used for AE monitoring are piezoelectric transducers, but \nfiber optic-based AE sensing technology is gaining more and more \nconsideration for the already mentioned advantages related to the use of \noptical fibers [64]. In-flight AE sensors have been successfully demon -\nstrated on the DC-XA in-flight experimentation vehicle. The AE moni -\ntoring system was conceived to have information on temperature limits, \nvibrations, noise characterization and to provide in-flight data from the \nLH2 cryogenic tank. The control unit AEFIS [65] (Acoustic Emission \nFlight Instrumentation System) was able to monitor and send informa -\ntion to the on-board computer for real-time monitoring. The system was \nalso conceived for active monitoring through excitation of the acoustic \nemission sensors. A health monitoring system with 48 sensors for strain \nand hydrogen monitoring was used on the composite hydrogen tank for \nthe X-33 experimental vehicle during on-ground tests. 
In addition, AE \nsensors for high temperatures have been developed for the structural \nmonitoring of the nose TPS on the X-38, now cancelled. Acoustic \nemission sensors have also been successfully applied during static tests \nof the X34 composite wing. \n3.3.3. Piezoelectric materials \nPiezoelectric materials [79] are composite materials with incorpo -\nrated electrical connections. Under the application of stress, their elec-\ntrodes are excited and the material is charged. Moreover, it manifests a \nlinear change in shape. Charge and linear change represent the char-\nacteristics of such materials in terms of their dual use as actuators \n(transforming electrical energy into mechanical energy) and sensors \n(detecting possible defects measuring structural variations). The most \ncommon family of piezoelectric materials is the so-called PZT (zirconate \ntitanate family). Used as actuators, PZTs sensors can actively monitor \nthe structure. Functioning as both transmitters and receivers, they can \nbe part of a flexible structural health monitoring system capable of \nperforming several evaluation functions. Presently, they can be used for \nactive damage detection with high-frequency electro-mechanical \nimpedance method, or active damage detection with the pulse-echo and \npitch-catch techniques using Lamb-waves, or as passive sensors for \nlow-impact damage detection and acoustic emission detection [66]. \nFurthermore, they can be used in a phased array of sensors that allows, \nthrough the superposition of the generated waves, to focus or steer the \nbeam in a specific direction. Several studies have demonstrated the \ncapability of piezoelectric sensors for damage detection in composite \nmaterials. Studies at ONERA have demonstrated that Lamb waves are \nsensitive to debonding caused by low impact in a sandwich structure \n[67]. \n3.3.4. 
Micro-Electromechanical Systems (MEMS) \nMicro-Electromechanical Systems (MEMS) are thin-film devices \nproduced through photolithography and chemical etching. Sensors for \ntemperature and pressure measurements are already available as com-\nmercial off-the-shelf products, but other MEMS sensors exist such as \naccelerometers, gyros, acoustic emission, and chemical sensors. The \nadvantage of using MEMS sensors is their small size and potentially low \ncost. They can be easily embedded or surface bonded. Furthermore, with \nan ASIC (Application Specific Integrated Circuit) technology, it is \npossible to create a microsystem of different sensors in one single chip \n[68]. Some issues for structural health monitoring of aerospace struc-\ntures are the temperature range that at best goes from −50 °C to \n175 °C. Furthermore, the temperature dependency of some sensors \nmay affect the measurements, thus limiting the performance [69]. \nDevelopment is required to attain space qualification, and most of all, \nthese devices should be tested in real environment conditions. Another \nissue is the packaging of MEMS sensors. As an example, a smart layer M. Eugeni et al.\n\n[Página 9]\nActa Astronautica 192 (2022) 276–290\n284composed of PZT sensors developed by Acellent Tech. Inc. has been \nembedded into a composite laminate that was also equipped with an \nelectromagnetic layer to measure electrical resistance properties. \n3.3.5. Self-monitoring materials \nSome structural materials can be used as self-monitoring materials, \nwhich means they can sense their own strain and damage by measuring \ntheir electrical resistance [70]. Carbon fiber-reinforced polymers are \nvery suitable as self-monitoring materials since the fibers are electrically \nconducting and the electrical properties of the material are sensitive to \ndamage. 
Self-monitoring materials are intrinsically smart, which means \nthey don’t need embedded or attached sensors, so they have some ad-\nvantages like low cost, simple design, great durability, large sensing \nvolume and absence of mechanical property degradation due to \nembedding of sensors. A problem of concern for electrical measurements \nis the electrostatic disturbance due to the electrical charging of the \nstructure when flying through charged atmosphere at high speeds or in \norbit due to encounter with ionized molecules. Another issue to be \naddressed is the ability to locate the damage in large composite struc-\ntures. As an alternative, CFRP self-healing materials are under study \n[71], having the advantage of using a new ISOX (iso-\ncyanurate-oxazolidone) thermosetting matrix able to restructure itself in \ncase of delamination or debonding (fiber breakages are not detectable \nthough). \n3.3.6. Thermocouples, strain gauges and accelerometers \nTogether with new sensing technologies such as optical fibers, pie-\nzoelectrics and so on, conventional sensors are also used for structural \nhealth monitoring in the space industry. The major issues for sensors \nsuch as thermocouples, strain gauges and accelerometers are the weight \npenalty from the sensor itself, but also from the wires required to pro-\nvide power and data communication. Wireless transceivers can be used \nto overcome this penalty. These have been flight tested at NASA in the \nframe of ARIES experiment as part of an Integrated Vehicle Health \nMonitoring architecture [72]. The transceivers radio frequency emis-\nsions have been demonstrated to not have interference with communi -\ncation and navigation antennas. \n3.3.7. Thermography \nThermographic methods are non-destructive inspection methods in \nwhich the presence of flaws is determined by monitoring the flow of heat \nover the surface of a structure after some external introduction of a \ntemperature gradient [73]. 
The presence of flaws disrupts the normal \npattern of heat flow that would be expected in a sound structure. The \nmethod is more sensitive to flaws near the surface. Modern thermo -\ngraphic systems commonly use infrared (IR) cameras to detect radiated \nheat and are controlled by TV video electronics which sample the field of \nview at a typical rate of 50 Hz, allowing temperature variations on a 20 \nms timescale to be resolved [74]. The camera is sensitive to temperature \nchanges of about 0.005 °C and covers a chosen range of temperature, \n4 °C and 8 °C being commonly suitable, although operation is feasible \nbetween −50 °C and 100 °C. Liquid crystal coatings and pyroelectric \ndetectors have also been used to detect IR radiation. Thermographic \nmethods fall broadly into two groups: active methods, and passive \nmethods. Active methods are those in which the thermal gradient is \nproduced and continuously maintained by the application of cyclic \nstress. An interesting application of the IR thermographic technique is the \ninstallation of a thermo-camera on an unmanned aerial vehicle for the \nmonitoring of defects at the distance of 2 m and 6 m [75]. Passive \nmethods are those in which the thermal gradient results from a transient \nchange. Passive methods are the most widely applied NDI techniques in \ncomposites inspection. Also, non-IR conductive thermography has been \napplied to aerospace applications, such as in the field of Maintenance, \nRepair and Overhaul (MRO), being able to identify defects in a laminate \ncomposite at low temperature [76]. 3.3.8. Ultrasonic testing \nUltrasonic testing (UT) is the most widely used non-destructive in-\nspection method for the examination of composites [77]. On micro -\nscopically homogeneous materials (i.e. non-composite) it is commonly \nused in the frequency range 20 kHz to 20 MHz. 
With composite mate-\nrials the testing range is significantly reduced because of the increased \nattenuation, so the operating frequency limit is usually 5 MHz or less. \nHowever, the ability to resolve small flaws will also be reduced. In most \ntechniques, short pulses of ultrasound (typically a few microseconds) are \npassed into the composite material and detected after having interro -\ngated the structure. The techniques include pulse-echo, through- -\ntransmission, back-scattering, acoustic-ultrasonics, and ultrasonic \nspectroscopy. In these methods, it is important to avoid frequencies at \nwhich resonance occurs between ply interfaces. For unidirectional plies \nspaced at 8 plies/mm this frequency is usually about 12 Mhz. There may \nbe an additional resonance for woven fabrics at approximately 6 Mhz for \n0.25 mm plies, although resonance at other frequencies has been seen in \npractice. Different approaches can be used: manual, immersion, and \nlaser testing. Moreover, an example of combination of UT and conven -\ntional IR thermography techniques is presented in Ref. [78], using car-\nbon/epoxy patches bonded on an aluminium plate and producing fusion \nalgorithms correlating both inspection results. \nSensors and NDI techniques above mentioned can be object of trade- \noff analyses to improve space manufacturing processes according to \ncustomers ’ requests, mainly to increase factories ’ KPIs like Quality of \nService (QoS) and defects rate. In the following chapter the case study of \na real space manufacturing process includes the assessment on the use of \nsome of these technologies. \n4.Case study: RUAG ’s composite sandwich panel manufacturing \nAs a case study, RUAG ’s composite sandwich panel manufacturing \nprocess was taken in consideration. Panel manufacturing today is still a \nlargely manual process. This is especially valid for large, non-serial \nspacecrafts for scientific missions. 
With the establishment of constella -\ntions during the last years, considerable effort was made to industrialise \nthe overall manufacturing process. Still, the state-of-the-art \nmanufacturing process is distant from an Industry 4.0 philosophy. \nHere follow the main process areas (according to the job-shop pro-\nduction system), each of which is made of stations, operations and \nphases: \n- Parts preparation: procured and stored parts (cut-to-shape \naluminium face sheets, already expanded aluminium honeycomb \ncore, adhesives, foams, inserts and heat pipes) are machined and pre- \nassembled. Parts whose surface is destined to external exposure are \ntreated under galvanic bath to prevent corrosion. \n- Panel assembly: the pre-assembly is bonded under hot press. \n- Panel inspection and testing: Non-Destructive Inspections (NDI) \ntechniques (e.g., Ultrasonic Inspection - UT) and testing (e.g., flat-\nwise tensile strength) are performed. \n- Panel equipment: hot-bonded inserts are automatically potted, and \ncold-bonded inserts are machined; thermal equipment (e.g., paint \nand heat pipes) is integrated. \nFor a summary of existing sensors or automated equipment deliv-\nering process data, with related measurement properties and units the \nreader can refer to Table 1 \nThe general approach to industrial panel manufacturing varies at \ndifferent points compared to the more traditional solution. With \nindustrial-based manufacturing, materials and processes are tailored to \nthe product itself. In the case of the sandwich panels, this means that \nface sheets are already procured cut to shape. Furthermore, time- \nconsuming processes are being automized, as for instance the bonding \nof inserts. A two-level approach has been considered to improve the \nprocess as shown in Fig. 9. M. 
Eugeni et al.\n\n[Página 10]\nActa Astronautica 192 (2022) 276–290\n285First, existing data must be collected, categorized, and interpreted, so \nthat bottlenecks or shortcomings can be more easily identified. Different \ntypes of sensors and non-destructive sensing techniques were identified, \nsee Fig. 10(a) and Fig. 10(b) showing how a sensing network is deployed \nover the observed process. The collection and categorization of data will \nbe possible thanks to the improvement of the built-in traceability sys-\ntem, extending it to the whole process and introducing an IoT infra-\nstructure based on a sensors network and a data processing and analytics \nplatform. Process parameters like pressure, humidity and temperature \nare tracked, as well as product part numbers; optical imagery aids \nquality control, sound alarms on thresholds helps day-by-day opera -\ntions; all these functions (and more programmable ones according to \nproduction needs) are so interconnected and easily monitorable by a \ndashboard by means the software architecture shown in Fig. 10 (c)). \nObservations are used to perform an AS-IS analysis about data \ncollection within the case process. The Acatech maturity model is chosen \nas a foundation for the development of a new assessment model to \nrepresent the current smartness level of the process. The new assessment \nmodel is based both on the evaluation of single activities, which is \ncrucial to thoroughly verify every operating step of the process, and on \nthe assessment of the whole process, which allows to identify transversal \nintegration elements, which would be otherwise scarcely visible. The \nfirst assessment focuses on the single activities. This process is based \nupon a customization of the Acatech model, which ensures a digital \nmaturity level assessment comprising six maturity stages: from a not- \ndigitalized company to a company with all the features of Industry \n4.0. 
This model was adjusted to the objective of the assessment, i.e. to \nmeasure the smartness of the process in terms of data collection, and to \nassess single activities. In particular, a qualitative assessment was per-\nformed, and the achievement of a smart level was evaluated according to \nthe maturity model ’s features of computerization, connectivity, visibil -\nity, transparency, predictive capacity and adaptability, see Sec. 2.1 and \nFig. 8. \nAn analysis of possible gatherings of new useful information by new \ntechnologies or new stations can be conducted once the already avail-\nable are collected and analysed by means a suitable software and \ncomputing infrastructure. In case the interpretation of data executed at \nstep 1 needed a deeper insight or critical points in the process were \nidentified, some new technologies should be added accordingly. One of Table 1 \nAs-is process: sensors and automated equipment with related measurement pa-\nrameters and units. © RUAG Space. 
\nPROCESS \nSTATION/ \nOPERATION/ \nPHASE SENSOR/ \nEQUIPMENT MEASUREMENT \nPROPERTY MEASUREMENT \nUNIT \nParts preparation/ \nPanel milling Laser External dimensions \n(lenght, width, pocket \npositions) Mm \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Timer Time of bath S \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition pH \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition Concentration \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Timer Time of bath S \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition pH \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition Concentration \nParts preparation/ \nAdhesives/ \nIncoming \ninspection Tensile \ntesting \nmachine Lap shear strenght Mpa \nParts preparation/ \nAdhesives/ \nStoring Timer Storage time S \nSandwich \nassembly/ \nSandwich layup Laser Alignment Mm \nSandwich \nassembly/Panel \nbonding Hot press Pressure Bar \nSandwich \nassembly/Panel \nbonding Hot press Temperature •C \nSandwich \nassembly/Panel \nbonding Hot press Time S \nPanel inspection \nand testing/ \nUltrasonic \ninspection Sensor Panel defects \n(delamination, \ninhomogeneity, \nbonding defects, etc.) 
dB \nPanel inspection \nand testing/ \nFlatwise tensile \ntest Tensile \ntesting \nmachine Tensile strenght Mpa \nPanel inspection \nand testing/3- \npoint and 4- \npoint bending \ntest Tensile \ntesting \nmachine Bending strenght Mpa \nPanel inspection \nand testing/ \nThermal cycling Thermal \nchamber Outgassing % \nPanel equipment/ \nInsert potting APM Insert-injected \nadhesive mass G Table 1 (continued ) \nPROCESS \nSTATION/ \nOPERATION/ \nPHASE SENSOR/ \nEQUIPMENT MEASUREMENT \nPROPERTY MEASUREMENT \nUNIT \nPanel equipment/ \nInsert potting APM Adhesive mixing ratio % \nPanel equipment/ \nInsert potting APM Insert height w.r.t. \nfacesheet Mm \nPanel equipment/ \nInsert potting APM Insert angle w.r.t. \nfacesheet Rad \nPanel equipment/ \nInsert potting APM Insert position Mm \nPanel equipment/ \nAdhesive curing Sensor oven Curing temperature •C \nPanel equipment/ \nAdhesive curing Sensor oven Curing time S \nPanel equipment/ \nInsert proof-load \ntest Sensor Load-displacement \ndiagram N/mm \nPanel equipment/ \nInsert pull-out \ntest Sensor Pull-out load N \nPanel equipment/ \nHeater bonding Laser Position Mm \nPanel equipment/ \nMLI bonding Testing \nmachine Bonding strenght Mpa \nPanel equipment/ \nTie-base bonding Testing \nmachine Bonding strenght Mpa M. Eugeni et al.\n\n[Página 11]\nActa Astronautica 192 (2022) 276–290\n286the main process phases to concentrate on to add information is quality \ntesting. Quality testing usually requires long times and heavily impacts \nboth the technical and economic aspects of the process. Making it more \nagile and automating it would fasten the process and make the testing \nitself more accurate thanks to incorporated statistical models. In our \ncase study, testing stations J4 and J5 right after panel bonding (process \nstep 11) and J2 just after panel machining and inserts installation by \nmeans of RUAG ’s fully automated APM technology (process step 13) is \nof particular interest for future developments. 
\nThe method proposed to assess the AS-IS process status will be \napplied to understand the steps required to reach the desired “smart ” \nlevel, in terms of individual activities, and to understand how to generate a greater level of interconnection and be able to monitor a \ngreater number of performances. If, for instance, the intention is to \nguarantee that the available data generates an “Enterprise ” level of \ninterconnection throughout the entire process, see Fig. 6, it would be \nnecessary to guarantee a circulation of data that goes beyond the com-\npany ’s internal borders, in an extensive and transversal manner between \nthe various constituent areas. The aim of this study ’s CPS is to reach the \nManufacturing Operations Management (MOM) or Manufacturing \nExecution System (MES) level. However, its inherent feature of scal-\nability allows the extension from the single process to the overall factory \nto the overall plant. \nAn example of process improvement through the application of the \nFig. 8.An example of process performance assessment using the AS-IS model. \nFig. 9.The study ’s approach has two levels: data collection and interpretation, aimed to gather data from the process, and CPS architecture and implementation, to \ndigitalize the existing data and possibly add new information. Measurement of KPIs is then applied to both industrial and digital aspects of the study to verify \nimprovements. M. Eugeni et al.\n\n[Página 12]\nActa Astronautica 192 (2022) 276–290\n287CPS model was realized through a preliminary simulation of raw data \ncoming from the Automated Potting Machine and connected to the \nsoftware architecture described in Fig. 11. First of all, the APM data \n(represented by a list of measurements and their timestamp) is included \nin a database. Once the database is collected, data is normalized and aggregated online according to the different timeframes and stored in \nthe data lake. 
In batches, such data is clustered and displayed in a \ndashboard. The data collection and visualization allow the monitoring, \ncontrol, and use of data analysis to detect process deviations for example \nto stop the line or alert operators. A possible dashboard and an example \nFig. 10.RUAG ’s sandwich composite panel manufacturing process is shown before (a) and after (b) the integration of existing sensors with an IoT network com-\nmanded by a computing infrastructure. Sensors measure temperature, pressure and humidity and scan panel ’s surface through optical and laser systems. Traceability \nis also performed through barcodes. The whole process is included in a tree-shaped system. The computing infrastructure is then represented in detail (c). Online \nprocessing of sensors ’ data inputs is performed through actions including preprocessing, normalization, thresholds ’ check, and monitoring. Processed data is then \nstored in a data lake, where users are able to have continuous open access, while data are interpreted by a statistical model-based closed-loop of KPIs ’ prediction and \nforecast and are displayed through a user-friendly visual dashboard. Some of the many SW platforms available in the market to realize such concept are mentioned \n[81–86,88–90,92]. \nFig. 11.The CPS∕layers as a flux of data from input \nto output. In the first layer data from interconnected \nsensors (IoT) is simulated or collected from historical \narchives, so that the process is reconstructed (DT). In \nthe cyber layer, i.e. the core of the CPS, data collec -\ntion, storage and analytics is done with the help of \nstatistical predictive models, allowing data correla -\ntion (AI). In the final layer, data can be visualized \nthrough reports and insights and interpreted with \nhuman touch, allowing to understand causation \neffects. M. Eugeni et al.\n\n[Página 13]\nActa Astronautica 192 (2022) 276–290\n288of graphs displayable as output are shown respectively in Fig. 
12 and \nFig. 13. \nThe approach was extended to the whole process thanks to its layout \nreconstruction in the cyber space, see Fig. 10. In Fig. 14 a representation \nof the Sandwich Panel Manufacturing process using BPMN and a simu-\nlation through Bizagi Modeler allows the performance of a «what if » \nanalysis. This tool is useful to investigate costs and times needed to \nexecute the entire process. A top-down approach was applied: starting \nfrom a model of the macro-tasks, and then defining each task following \nthe most left representation to define each block as an independent \nprocess. This allows a detailed analysis, gaining a more realistic repre -\nsentation on the timing of the macro-block. Finally, the model reaches \nautomation level and is upgraded with a Markov-chain-based AI algo-\nrithm able to show probabilities of failure for sample properties of in-\nterest. The system upgrade can be categorized in three levels: \n1. Level 1 – “Process monitoring ” \nThis level is characterized by the ability of the CPS to process the \ncollected data automatically generating reports and sending alarms, \nbased on inputs pregiven manually. In case of failures being signalized, \nthe information provided allows the operators and/or process engineers \nto intervene and adjust the process parameters to address the issue. \nReports can assist in the identification of trends by displaying data over \na longer period. \n2. Level 2 – “Small-scale process control ” \nAt this level, further analysis and interpretation is performed auto-\nmatically by the AI algorithms to predict the outcome of the process. For \ninstance, the CPS can stop and restart the potting process with a new \ninsert if the probability of negative process outcome is high. Based on \nidentified trends, the CPS can signal potential failures before they occur. 
\nHowever, the system is incapable of adjusting any of the process pa-\nrameters to keep the process running and avoid the identified threats. \n3. Level 3 – “Large-scale process control ” \nAt level 3, the AI-assisted CPS can optimize the process parameters to \nachieve optimal process result – delivering the right product quality in \nthe shortest production time. It can perform continuous predictive analysis on all production system components using the data fed in real- \ntime by the sensor network. Based on the performance forecast, the CPS \ncan predict the completion time for each panel, tool exchange rates, and \nequipment maintenance intervals, thereby being able plan the entire \nmaterial flow through the station. At this stage, multiple production \nstations can be interconnected using the same CPS. \nTo reach these levels, capital investment in upgrading the production \nsystem is necessary. Table 2 shows estimated investment figures needed \nto support the CPS implementation. \n5.Conclusions \nThe paper contextualized Smart Manufacturing technologies in the \nfast-evolving market of large constellations of small satellites and \nrelated new production paradigms. A review of fundamental theoretical \nconcepts behind Industry 4.0 disruptive change was presented, focused \non Cyber-Physical Systems and their 5C-level standard architecture. \nPossible Smart Manufacturing solutions, in terms of hardware and \nsoftware technologies, were reviewed to contribute to a future signifi -\ncant improvement and optimization of a whole MAIT cycle. CPS, DT and \nIoT were selected as the most promising technologies to be adopted and \nRUAG ’s composite sandwich panel manufacturing process was taken as \ncase study. The process was reconstructed so that each sensor could be \nsimulated in the cyber space as a flux of data. 
In parallel, an assessment \nof the SM level of the process according to the Acatech maturity model \nwas carried on unlocking the process improvement potential. The flux of \ndata flowing from the sensing layer into the cyber layer of the CPS \nthrough an interconnected IoT network is represented by unit blocks \nrelated to each process step. The use of AI upgrades the model, giving it \nthe ability to also reach some level of process control and optimization. \nThree different levels of process improvement are identified each of \nwhich is linked to its economic estimation of the necessary computing \ninfrastructure. By this model equipment data can be interpreted through \nits pre-processing, normalization, storage and distribution to a user- \nfriendly visual dashboard, according to a new logical analysis of the \nindustrial process, delivering the final improvement, represented by the \nopportunity of reconfiguring the production line to reach the goals \nmeasured by traditional Key Performance Indicators (KPIs), among \nwhich panel production rate and Overall Equipment Efficiency (OEE), \nand optimize specific parameters related to SM, such as process agility \nand flexibility and the CPS scalability. \nFig. 12.An example of the CPS dashboard. M. Eugeni et al.\n\n[Página 14]\nActa Astronautica 192 (2022) 276–290\n289Declaration of competing interest \nThe authors declare that they have no known competing financial \ninterests or personal relationships that could have appeared to influence \nthe work reported in this paper. \nAcknowledgment \nThe present paper results from the project “Smart Manufacturing for \nfuture constellations ” funded by the European Space Agency (ESA ITT \nAO/1 –10002/19/NL/AR for technology development) and developed in \ncollaboration by Sapienza University of Rome, Thales Alenia Space Italy \nand RUAG Space. \nReferences \n[1]M. Blanchet, THINK ACT. INDUSTRY 4.0. The New Industrial Revolution. 
How \nEurope Will Succeed, Roland Berger, March 2014 . [2]E. S. Agency, What Is Space 4.0? [Online]. Available, November 2021. November \n2021, https://www.esa.int/About_Us/Ministerial_Council_2016/What_is_space_4. \n0. \n[3]R.Y. Zhong, X. Xua, E. Klotz, S.T. Newmanc, Intelligent manufacturing in the \ncontext of industry 4.0: a review, Engineering 3 (2017) 616–630. \n[4]C. Daehnick, I. Klinghoffer, B. Maritz, B. Wiseman, “Large LEO Satellite \nConstellations: Will it Be Different This Time?, ” McKinsey &Co, Aerospace and \nDefence Practice, May 2020 . \n[5]UK saves OneWeb, Spaceflight 62 (September) (2020) . \n[6]J. Hou, Y. Zhao, Y. Zhou, X. Du and Z. Li, “The creative application of DIY \nmanufacturing technology in remote sensing satellite, ” Aero. China. Vol. 17. N.2, \nSummer 2016. \n[7]K. Jackson, K. Efthymioua, J. Borton, “Digital Manufacturing and Flexible \nAssembly Technologies for Reconfigurable Aerospace Production Systems, ” \nChangeable, Agile, Reconfigurable & Virtual Production Conference, 2016 . \n[8]A. Kusiak, Smart manufacturing, Int. J. Prod. Res. 56 (2018) 508–517. \n[9]S. Marigonda, “Smart Manufacturing: sfide e opportunit ˇa.,” Digital Tools 4.0. \n[10] L. Li, China ’s manufacturing locus in 2025: with a comparison of “Made-in-China \n2025 ” and “Industry 4.0, Technol. Forecast. Soc. Change 135 (2018) 66–74. \n[11] L.D. Xu, Industry 4.0: state of the art and future trends, Int. J. Prod. Res. 56 (8) \n(2018) . \n[12] C. Bryson, Heritage and Satellite Manufacturing: Firm-Level Competitiveness and \nthe Management of Risk in Global Production Networks, Economic Geography, \n2019, pp. 423–441. \n[13] C. Salkin, M. Oner, A. Ustundag, E. Cevikcan, A Conceptual Framework for \nIndustry 4.0, 2018 . \n[14] K. Nakamoto, K. Shirase, Simulation technologies for the development of an \nautonomous and intelligent machine tool, Int. J. Autom. Technol. (2013), https:// \ndoi.org/10.20965/ijat.2013.p0006 . \n[15] K.D. Thoben, S. Wiesner, T. 
Wuest, Industrie 4.0’ and smart manufacturing – a \nreview of research issues and application examples, Int. J. Autom. Technol. 11 (1) \n(January 2017) 4–16. \n[16] G.G. Schuh, Industrie 4.0 Maturity Index. Managing the Digital Transformation of \nCompanies [Online]. Available:, 2017. February 2021, https://hal.archives-ouver \ntes.fr/hal-02455705 . \n[17] V. Cruz-Machado, Scanning the industry 4.0: a literature review on technologies \nfor manufacturing systems, Engineering Science and Technology, an International \nJournal 22 (3) (June 2019) 899–919. \n[18] D.P. Perales, F.A. Valero, A.B. García, Industry 4.0, A Classification Scheme, 2018 . \n[19] O. Cardin, Classification of cyber-physical production systems applications: \nproposition of an analysis framework, Comput. Ind. 104 (January 2019) 11–21, \nhttps://doi.org/10.1016/j.compind.2018.10.002 . \n[20] A. Rojko, Industry 4.0 concept: background and overview, International Journal of \nInteractive Mobile Technologies 11 (5) (2017) . \nFig. 13.Examples of graphs showable by the dashboard: the first graph represents the single operation ’s timing vs time, the second one the production efficiency vs \ntime and the last one a map of discarded APM inserts for adhesive quantity. Scales are not shown for confidential reasons. \nFig. 14.The process layout represented in the cyber space and its focus at APM. \nTable 2 \nProduction volume requirements - rough order of magnitude estimates. \nCPS \nUpgrade \nLevel Level \nDescription Estimated \nMachine \nProcurement \nCost Increase \n[%] Estimated CPS \nImplementation \nand Operation \nCost [EUR] Minimum \nProduction \nVolume \n[inserts] \nLevel 1 Process \nmonitoring 3–5 42∕000 €/5 years 20.000 \nLevel 2 Small-scale \nprocess \ncontrol 10–15 55∕500 €/5 years 200.000 \nLevel 3 Large-scale \nprocess \ncontrol 40–60 82∕500 €/5 years 1.000.000 M. Eugeni et al.\n\n[Página 15]\nActa Astronautica 192 (2022) 276–290\n290[21] B.-h. Li, H. Bao-cun, L. Xiao-bing, Y. Chun-wei, Y. 
Wen-tao, Applications of \nartificial intelligence in intelligent manufacturing: a review, Frontiers of \nInformation Technology & Electronic Engineering 18 (1) (2017) 86–96. \n[22] J. Jadaan, K.S. Siderska, Cloud manufacturing: a service-oriented manufacturing, \nEngineering Management in Production and Services 10 (1) (2018) 22–31. \n[23] N. Khan, I. Yaqoob, I. Abaker, T. Hashem, Z. Inayat, W. Kamaleldin, A. Mahmoud, \nM. Alam, M. Shiraz, A. Gani, Big Data: Survey, Technologies, Opportunities, and \nChallenges, ” The Scientific World Journal, July 2014 . \n[24] C. Duke, G. Sadlier, D. Herr, Industry 4.0 and the Future of UK Space, ” London \nEconomics, 2019 . \n[25] E. Sisinni, A. Saifullah, S. Han, U. Jennehag, M. Gidlung, Industrial internet of \nthings: challenges, opportunities, and directions, IEEE Trans. Ind. Inf. 10 (10) \n(2018) . \n[26] H. Li, Application research of virtual reality and augmented reality, Advances in \nIntelligent Systems and Computing 1233 (2021) 494–499. \n[27] Federal Ministry for Economic Affairs and Energy, Plattform Industrie 4.0 - \nRAMI4.0 – a reference framework for digitalisation, Plattf. Ind. 4 (2019), 0. \n[28] M. Yli-Ojanper aa, S. Sierla, N. Papakonstantinou, V. Vyatkin, Adapting an agile \nmanufacturing concept to the reference architecture model industry 4.0: a survey \nand case study, Journal of Industrial Information Integration 15 (2019) 147–160. \n[29] J.H. Kim, A review of cyber-physical system research relevant to the emerging IT \ntrends: industry 4.0, IoT, big data, and cloud computing, Journal of Industrial \nIntegration and Management 2 (3) (2017) . \n[30] H. Gill, R. Baheti, Cyber-physical systems, in: T. Samad, A.M. Annaswamy (Eds.), \nThe Impact of Control Technology, 2011 . \n[31] H. Gill, R. Baheti, Cyber-physical Systems: from Theory to Practice, 2011 . \n[32] L. Monostori, Cyber-physical systems in manufacturing, CIRP Ann 65 (2) (2016) \n621–641. \n[33] R. Rajkumar, I. Lee, L. Sha, J. 
Stankovic, Cyber-physical systems: the next \ncomputing revolution, Des. Autom. Conf. (2010) 731–736. \n[34] A. Napoleone, M. Macchi, A. Pozzetti, A review on the characteristics of cyber- \nphysical systems for the future smart factories, J. Manuf. Syst. 54 (December) \n(2019) . \n[35] S. Thiede, M. Juraschek, C. Herrmann, Implementing cyber-physical production \nsystems in learning factories, Procedia CIRP 54 (2016) 7–12. \n[36] C. Zhan, Y. Chen, A review of research relevant to the emerging industry trends: \nindustry 4.0, IoT, blockchain, and business analytics, Journal of Industrial \nIntegration and Management 5 (1) (2020) 165–180. \n[37] H. Chen, Theoretical foundations for cyber-physical systems: a literature review, \nJournal of Industrial Integration and Management 2 (3) (2017) . \n[38] Y. Lu, Cyber physical system (CPS)-based industry 4.0: a survey. Journal of \nIndustrial Integration and Management, Journal of Industrial Integration and \nManagement 2 (3) (2017) . \n[39] G.K. Rand, N. Singh, D. Rajamani, Cellular manufacturing systems design, planning \nand control, J. Oper. Res. Soc. (1997) . \n[40] T. Pultarova, “Satellite Manufacturing in a State of Transition, ” [Online]. \nAvailable: http://interactive.satellitetoday.com/via/march-2019/satellite-manu \nfacturing-in-a-state-of-transition/_fragment.html . [Accessed October 2020]. \n[41] P.M. Laurent Jaffarta, Constellations: The satellite serial production challenge, in: \n71st International Astronautical Congress (IAC) – the CyberSpace Edition, October \n2020, pp. 12–14. \n[42] e. directory, “WorldView legion constellation, ” European Space Agency. [Online]. \n[Accessed February 2021]. \n[43] C. Hofacker, How to Make a Megaconstellation, March 2020 [Online]. Available: \nhttps://aerospaceamerica.aiaa.org . \n[44] T. Gornet, T. Wohlers, History of Additive Manufacturing, ” Wohlers, 2014 . \n[45] A. Javaid, M. 
Haleem, Additive manufacturing applications in industry 4.0: a \nreview, Journal of Industrial Integration and Management 4 (4) (2019) . \n[46] K. Schwab, The Fourth Industrial Revolution, Portfolio Penguin, 2017 . \n[47] A. B˘ecue, CyberFactory#1 – securing the Industry 4.0 with cyber-ranges and digital \ntwins, in: IEEE, 2018 . \n[48] HMI/SCADA software in the age of Industrial IoT and evolving human machine \ninterfaces, ” I-Scoop, [Online]. Available: https://www.i-scoop.eu/industry-4-0/h \nmi-scada-software/ . [Accessed February 2021]. \n[49] Y. Lu, Current Standards Landscape for Smart Manufacturing Systems, ” National \nInstitute of Standards and Technology - US Department of Commerce, February \n2016 . \n[50] H. Wang, Enterprise system and its application in aerospace industry, Journal of \nIndustrial Integration and Management 2 (2) (2017) . \n[51] I.C. Reinhardt, Current perspectives on the development of industry 4.0 in the \npharmaceutical sector, Journal of Industrial Information Integration 18 (3) (2020) . \n[52] H. Wu, S. Li, L.D. Xu, Internet of things in industries: a survey, IEEE Trans. Ind. Inf. \n10 (4) (2014) 2233 –2243 . \n[53] A. B˘ecue, A new concept of digital twin supporting optimization and resilience of \nfactories of the future, Appl. Sci. 10 (2020) 4482 . \n[54] T. Fei, Z. Meng, Digital twin shop-floor: a new shop-floor paradigm towards smart \nmanufacturing, IEEE Access 5 (2017) . \n[55] H. Gill, R. Baheti, Cyber-physical systems. The impact of control technology, IEEE \nControl Systems Society 1 (2011) . \n[56] E.A. Lee, Cyber physical systems: design challenges, in: 11th IEEE. International \nSymposium on Object and Component-Oriented Real-Time Distributed Computing, \nISORC)., 2008, pp. 363–369. [57] M. Abdirad, A two-stage metaheuristic algorithm for the dynamic vehicle routing \nproblem in industry 4.0 approach, J. Manag. Anal. 1 (15) (2020) . \n[58] J. Lee, B. Bagheri, H.A. 
Kao, A Cyber-Physical Systems architecture for Industry \n4.0-based manufacturing systems, Manufacturing Letters 3 (2015) 18–23. \n[59] G. Aceto, V. Persico, A. Pescap ˘e, Industry 4.0 and health: internet of things, big \ndata, and cloud computing for healthcare 4.0, Journal of Industrial Information \nIntegration 18 (2020) . \n[60] X. You, Towards 6G Wireless Communication Networks: Vision, Enabling \nTechnologies, and New Paradigm Shifts, vol. 64, Science China - Information \nSciences, 2021 . \n[61] Y. Lu, Security in 6G: the prospects and the relevant technologies, Journal of \nIndustrial Integration and Management 5 (3) (2020) 271–289. \n[62] A. Cusano, P. Salvarezza, G. Breglio, A. Cutolo, A. Calabr ˇo, M. Giordano, S. De \nNicola, An integrated fiber optic sensing system for in situ characterization of the \ncuring, Proc. SPIE 4328 (2001) 275–284. \n[63] K.H. Wood, T.L. Brown, M.C. Wu, C.B. Gause, Fiber Optic Sensors for Cure-Health, \n” Proceeding 3rd Intern. Workshop on Structural Health, 2001, pp. 1149 –1157 . \n[64] K. Saddik, M. Alam, A. El, C2ps: a digital twin architecture reference model for the \ncloud-based cyber-physical systems, IEEE Access 5 (2017) 2050 –2062 . \n[65] R.D. Finlayson, M. Friesel, M. Carlos, P. Cole, Health monitoring of aerospace \nstructures with acoustic emission and acousto-ultrasonics, in: 15th World \nConference on Non-destructive Testing, 2000 . \n[66] V. Giurgiutiu, A. Zagrai, J.J. Bao, Piezoelectric wafer embedded active sensors for \naging aircraft structural health monitoring, Int. J. Struct. Health Monitor. \nNovember (2001) . \n[67] D. Devillers, F. Taillade, D. Osmont, D. Balageas, D. Royer, Interaction of Lamb \nwaves with defects in composite sandwich structures, in: European COST F3 \nConference on System, 2000 . \n[68] J.S. Kim, K.J. Vinoy, V.K. Varadan, Wireless health monitoring of cracks in \nstructures with MEMS-IDT sensors, Proc. SPIE 4700 (2002) 342–353. \n[69] S.J. Burgett, M. 
Kranz, MEMS sensor systems developments at AMCOM for \nenvironmental conditions monitoring, in: Proc. 3 Rd Intern. Workshop on \nStructural Health Monitoring, 2001, pp. 1134 –1141 . \n[70] D. Chung, Structural health monitoring by electrical resistance measurement, \nJournal of smart materials and structures 10 (2001) 624–636. \n[71] L. Zhang, Novel self-healing CFRP composites with high glass transition \ntemperatures, Compos. Sci. Technol. 168 (2018) 96–103. \n[72] W.H. Prosser, T.L. Brown, S.E. Woodard, G.A. Fleming, E.G. Cooper, Sensor \ntechnology for integrated vehicle health management of aerospace vehicles, in: AIP \nConference Proceedings, vol. 657, 2003, p. 1582 . \n[73] P. Gaudenzi, M. Bernabei, E. Dati, G. De Angelis, M. Marrone, L. Lampani, On the \nevaluation of impact damage on composite materials by comparing different NDI \ntechniques, Compos. Struct. 118 (2014) 257–266. \n[74] X. Maldague, Theory and Practice of Infrared Thermography for Non Destructive \nTesting, John Wiley & Sons, Canada, 2001 . \n[75] S. Deane, Application of NDT thermographic imaging of aerospace structures, \nInfrared Phys. Technol. 97 (2019) 456–466. \n[76] D.I. Gillespie, Defect detection in aerospace sandwich composite panels using \nconductive thermography and contact sensors, Sensors 20 (2020) . \n[77] R.D. Finlayson, M. Friesel, M. Carlos, P. Cole, Health monitoring of aerospace \nstructures with acoustic emissions and acousto-ultrasonics, in: 15th World \nConference on Non-destructive Testing, October 2020 . \n[78] P. Daryabor, M.S. Safizadeh, Image fusion of ultrasonic and thermographic \ninspection of carbon/epoxy patches bonded to an aluminum plate, NDT E Int. 90 \n(2017) 1–10. \n[79] P. Gaudenzi, Smart Structures: Physical Behaviour, Mathematical Modelling and \nApplications, John Wiley Sons, 2009 . \n[81] [Online]. Available:, Elastic, November 2021. Accessed November 2021, http \ns://www.elastic.co/ . \n[82] Grafana [Online]. 
Available: Accessed November 2021, https://grafana.com/ , \nNovember 2021. \n[83] Ignite [Online]. Available: Accessed November 2021, https://ignite.apache.org/ , \nNovember 2021. \n[84] Kafka [Online]. Available: Accessed November 2021, https://kafka.apache.org/ , \nNovember 2021. \n[85] Kibana [Online]. Available: Accessed November 2021, https://www.elastic. \nco/kibana/ , November 2021. \n[86] Pytorch [Online]. Available: Accessed November 2021, https://pytorch.org/ , \nNovember 2021. \n[88] [Online]. Available:, Scikit Learn, November 2021. Accessed November 2021, \nhttps://scikit-learn.org/ . \n[89] Tensorflow [Online]. Available: Accessed November 2021, https://www.tensor \nflow.org/ , November 2021. \n[90] Redis [Online]. Available: Accessed November 2021, https://redis.io , November \n2021. \n[91] M. Li, “Spatial-Temporal Finite Element Analytics for CPS-Enabled Smart Factory: \nApplication in Hybrid Flow Shop, ” Procedia Manufacturing, 2020, pp. 1229 –1236 . \n[92] “Flink Flink [Online]. Available: Accessed November 2021, https://flink.apache. \norg/, November 2021. M. Eugeni et al.",
+ "93a802ed-36af-48c8-ac94-4bac559d4f39": {
+ "content": "Acta Astronautica 192 (2022) 276–290\nAvailable online 28 December 2021\n0094-5765/© 2021 IAA. Published by Elsevier Ltd. All rights reserved.An industry 4.0 approach to large scale production of satellite \nconstellations. The case study of composite sandwich panel manufacturing \nM. Eugenia,*, T. Querciaa, M. Bernabeia, A. Boschettoa, F. Costantinoa, L. Lampania, \nA. Marchetti Spaccamelab, A. Lombardob, M. Mecellab, L. Querzonib, R. Usingerc, \nM. Aliprandic, A. Stancuc, M.M. Ivagnesd, G. Morabitod, A. Simonid, A. Brand ~aoe, P. Gaudenzia \naDepartment of Mechanical and Aerospace Engineering, University of Rome “La Sapienza ”, Via Eudossiana 18, Rome, 00184, Italy \nbDepartment of Computer, Control, and Management Engineering Antonio Ruberti, University of Rome “La Sapienza ”, Via Ariosto 25, Rome, 00185, Italy \ncRUAG Schweiz AG, RUAG Space, Schaffhauserstrasse 580, 8052, Zürich, Switzerland \ndThales Alenia Space Italy, Via Saccomuro, 24, Rome, 00131, Italy \neEuropean Space Agency, ESTEC: European Space Research and Technology Centre Keplerlaan 1, 2201, AZ Noordwijk, Netherlands \nARTICLE INFO \nKeywords: \nIndustry 4.0 \nSpace 4.0 \nSmart manufacturing \nCyber-physical systems \nInternet of things \nDigital twin \nArtificial intelligence \nSpace Systems MAIT \nMega constellations ABSTRACT \nIn recent years the so-called New Space Economy or Space 4.0 paradigm has seen a number of new commercial \nplayers entering the satellite industry and creating completely new business models, most of which based on very \nlarge constellations consisting of several hundreds or even thousands of satellites. The production of the high \nnumber of satellites involved in modern mega-constellations is bringing in the space industry the necessity of \nimproved and optimized manufacturing approaches suitable for serial production , i.e., standard environment/ \nhigh number of platforms. 
In this framework, the adoption of Industry 4.0 methodologies within the space in-\ndustry will lead to a significant improvement and optimization of the whole Manufacturing Assembly Integration \nand Testing (MAIT) cycle. The main aim of Industry 4.0 is the creation of intelligent factories where \nmanufacturing technologies are upgraded and transformed by Cyber-Physical Systems (CPSs), the Internet of \nThings (IoT), Cloud Computing and Big Data Analytics with predictive monitoring features. Main element of the \nIndustry 4.0 approach is the synergic use of embedded sensing technologies in the frame of intelligent production \nprocesses, fostering a radical evolution of the industrial values chains, production value chains, and business \nmodels. In the present work, a possible application of the Industry 4.0 concepts to space industry is presented and \ndiscussed in terms of applicability and obtainable advantages. As a case study, the composite sandwich panel \nmanufacturing line of RUAG Space is considered. Particular focus will be given to the development of a CPS, by \nestablishing a control network of sensors (e.g. temperature, location, load) over a targeted MAIT process. \n1.Introduction \nNowadays, the terms “Industry 4.0” and “Smart Manufacturing ” \nhave become extremely popular to address the so-called Fourth Indus -\ntrial Revolution (4IR) [1] where the evolution of connectivity and \ncomputational calculus permit to create a bridge between physical and \nvirtual worlds. This connection is represented by Cyber-Physical Sys-\ntems, which will be the core of the present study. The same revolution \napplies indeed to the space sector. In 2016, while the European indus -\ntrial context was rushing into the innovation of factories to take \nadvantage of this new concept, the space industry – namely, the Euro-\npean Space Agency as its main promoter - followed through and launched the so-called “New Space ” or “Space 4.0” era [2]. 
The main \nproblem these ambitious initiatives aim to solve can be synthesized as \nmanufacturing inefficiency in a globalized competitive environment, i.e. \nthe slow operational response to customers ’ complex demand driven by \nincreasing availability of open information. \nThe background scenario to take into consideration to understand \nthis revolution is the worldwide rise of ICTs as a disruptive force of \nchange in any context, even society itself [3]. The faster and easier \navailability of data, as much as the greater reachability of people and \nplaces all over the world both physically and remotely, kicked-off an \nunstoppable globalization driver, increasing competitiveness and \nunlocking new opportunities of sharing knowledge to advance research \nor make a profit. For this reason, the rising demand for greater \n*Corresponding author. \nE-mail address: marco.eugeni@uniroma1.it (M. Eugeni). \nContents lists available at ScienceDirect \nActa Astronautica \nu{�~zkw! s{yo|kr o>!ÐÐÐ1ow�o �to~1m{y2w{m k�o2km�kk��~{!\nhttps://doi.org/10.1016/j.actaastro.2021.12.039 \nReceived 25 November 2021; Accepted 23 December 2021 \nActa Astronautica 192 (2022) 276–290\n277connectivity pushed new commercial players to risk large investments in \nthe space industry with completely new business models [4]. American \nventures and start-ups led the way and invented a new concept of \nexploitation of already in vogue small satellites, putting them in large \nconstellations and in LEO to give high-bandwidth, low latency internet \naccess to remote areas or to gather new data from more frequent or \nhigher quality observations. The market segment grew exponentially in \nthe last decade, with constellations of even thousands of satellites being \nalready in the launch phase. Nowadays 2500 satellites are actively \norbiting around Earth and are expected to be 50 k in ten years [4]. Fig. 
1 \nshows the present status of the largest constellations: Space X’s Starlink \nis the most ambitious one, followed by Amazon ’s Project Kuiper. Both \nprojects aim at operating more than 1000 satellites at a time. In the top \nten also Airbus One Web can be found, whose bankruptcy had worried \ninvestors and shareholders, up to its recent rescue by UK government to \nconvert it into a navigation system after the loss of participation in \nGalileo project because of Brexit [5]. Among the well-known English \ncompanies, SatRevolution is an example of a company mainly based in \nEurope. However, all these companies have made international coop-\neration with ventures, billionaires, or big space players to allow the \nrealization of their projects. A significant reduction of costs is therefore \nnecessary for the industry to take advantage of such a promising new \nmarket segment and open it to smaller businesses or more traditional \nmanufacturing players. Producing thousands of satellites of high quality \nand with tighter deadlines then becomes the top priority, thus requiring \nan innovative approach to manufacturing processes, which made Smart \nManufacturing and its related technologies the best available solutions. \nThe biggest challenges in trying to reduce costs while leveraging capa-\nbility are the following: \n≡the increased diversification of requirements asked by customers or \nusers. \n≡the short lead-time to market from product development to product \ndelivery, reduced by global competitiveness. ≡the higher quality assurance needed by more complex new tech-\nnologies [3]. \n≡products reliability, stability, and longevity [6]. \nThe main challenge related to cost reduction is linked with the \nlimited-in-time capital investments, especially concerning the cost of \nlaunch. Developments are being made to make smaller, more flexible \nlaunchers at better prices. 
Analytics, computing power and AI (Artificial \nIntelligence) algorithms can improve the operations management of \nlarge constellations, reducing response times and operating costs. The \ngoal is to reach the autonomous or semiautonomous spacecraft control \nand management [4]. However, in the space industry the scale of vol-\nume product does not allow for the introduction of automation [7] as \nmuch as it does in mass-market sectors, thus generating the need for \nalternative concepts of product and process optimization, relying more \non IT (Information Technologies) than OT (Operational Technologies) \nor, better, on the integration of both. Even when more easily applicable, \na new wave of automation would require the conversion of blue-collar \njobs to white-collar jobs, with a fast reskilling and new training of \nhuman resources toward greater horizontal connectivity and interop -\nerability [8]. In any case, the space industry needs to become “smarter ” \nand its smartness level will be measured by the degree of reflection of its \nproducts and processes in the new digital world, also called the “cy-\nberspace ”. Competition is not between products or processes anymore, \nbut rather between the information services and analytics algorithms \nbehind them. The solution proposed to convey “smartness ” to the real-\nization of large constellations of small satellites can be borrowed by the \nnewest frontiers of Smart Manufacturing, especially in the framework of \n“Industry 4.0” initiatives spread all over the world [9]. Thanks to the \nprinciples of Smart Manufacturing, it is possible to translate a conven -\ntional in-line dedicated manufacturing process into a fully integrated \ndigitalized process using the latest information technologies. 
The space \nindustry has not a long experience in serial process optimization, \ntherefore it must take advantage of the state of the art in other industries \nto win the challenges previously mentioned and meet the need for a Acronyms/abbreviations \n4IR Fourth Industrial Revolution \nAGV Automated Guidance Vehicle \nALM Application Lifecycle Management \nAM Additive Manufacturing \nAPM Automated Insert Potting Machine \nASIC Application Specific Integrated Circuit \nCPS Cyber-Physical Systems \nCPPS Cyber-Physical Production Systems \nDT Digital Twin \nERP Enterprise Resource Planning ICT Information and Communication Technology \nIoT Internet of Things \nKET Key Enabling Technology \nKPI Key Performance Indicator \nLEO Low Earth Orbit \nMAIT Manufacturing Assembly Integration and Testing \nMEMS Micro Electro-Mechanical Systems \nMES Manufacturing Executive System \nMOM Manufacturing Operations Management \nNDI Non-destructive Inspection technique \nOT Operational Technology \nUT Ultrasonic Testing \nFig. 1.Pareto chart of planned and launched small satellite constellations per number of satellites as of March 2020. [© newspace.im ]. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n278better capability over cost ratio. It will need to master the latest de-\nvelopments in other industries in the field of Smart Manufacturing and \ntake them to the next level for the first time. Opportunities for stan-\ndardization, modularization and serialization are evident, especially \nbenchmarking with the Do-It-Yourself philosophy that the Chinese \nAerospace industry is trying to pursue with micro and nanosatellites. For \nexample, from a strategic point of view, with respect to German-born \n“Industry 4.0”, Chinese plan (called “Made in China 2025”) has star-\nted with a pilot and then will be extended step by step [10,11]. 
Their \ngoal is to introduce a comprehensive innovation system on a small scale \nto increase the manufacturing capability index and thus the convenience \nof the product at macro level both for producers and customers [6]. \nBesides the fundamental trend of lowering costs, satellite \nmanufacturing moves toward the concept of universality, that is the \npossibility to leverage international competitiveness for supplies and \nraw materials to set factories for enhanced rapidity, easier maintenance, \nand better upgradability [6]. Following this reasoning, the UK space \nsector in 2019 explored the interesting concept of a “Global Production \nNetwork”, focused on dynamics and thus on the importance of heritage \nto manage associated risks. In particular, the UK sector is trying to \nmitigate risks by counting on well-proven technology and structuring \nsolid relationships with national and international regulators [12]. The \nBrexit and covid-19 pandemic, however, will put the success of this \nviewpoint in doubt for two reasons: first of all, after Brexit the UK will be \nless and less protected by the European Community, being more of a \ncompetitor in the European market, and thus incurring in higher taxa-\ntion for import/export, thus losing the advantage of lowering process \nand product costs [5]; secondly, having disrupted logistics, travel and \ntransportation, the covid-19 and its safety regulations will force Euro-\npean space companies to rapidly invest in new home-made technologies \nin order to keep their workforce and avoid tensions coming from the risk \nof increased unemployment and difficulty to expatriate. 
\nThe paper is organized as follows: Section 2 explains the theoretical \nbackground necessary to understand the use of cyber-physical systems in \na space factory; Section 3 introduces the approach to its implementation, \nreviewing hardware technology, software technology, sensors systems \nand Non Destructive Inspection (NDI) techniques; in Section 4 the \napproach is applied to the case study on the real process of RUAG’s \nsandwich composite panel manufacturing; eventually, Section 5 pre-\nsents the conclusions of the study. \n2.Theoretical background: cyber-physical systems \nIn this Section, an overview of Smart Manufacturing concepts, tools \nand strategies is presented together with the most adopted SM frame -\nwork, RAMI 4.0, are illustrated. Among SM concepts, the theory of \nCyber-Physical-Systems is highlighted as the foundation for the inte-\ngration of IT and OT enabling the improvement of an MAIT process in \nthe space industry. Concepts, characteristics, and contextualization in a \nproduction environment are given. \n2.1. Smart Manufacturing concepts, tools and strategies \nSmart Manufacturing focuses on establishing intelligent and \ncommunicative systems based on interoperability, i.e. machine-to- \nmachine and human-to-machine interconnections, dealing with a digi-\ntalized data flow from intelligent and distributed system interaction \n[13]. Products, machines, and company processes acquire a higher level \nof knowledge by data acquisition of parameters, e.g. product charac -\nteristics, localization, process parameters (temperature, pressure, speed, \netc.), and also information from the other stakeholders (e.g. customers, \nsuppliers). This data collection is transferred through internal or \nexternal communication networks, to be shared and to enable \nself-control capacities of products, machines, processes. 
Thus, these el-\nements become “smart”: capable to measure, recognize, communicate, \ncarry out decision-making processes (mostly without man intervention), to activate actions and operations in production [14]. Smart \nmanufacturing in short is “a data intensive application of information \ntechnology at the shop floor level and above to enable intelligent, effi-\ncient, and responsive operations” [15]. To consider a process “smart”, it \nis necessary to satisfy the following characteristics [16]: (I) computeri -\nzation, or the ability to control or monitor operations through pro-\ngrammable logics such as PLC, microcontroller, or microcomputer; (II) \nconnectivity, achieved through communication networks such as 4G, \n5G, Wi-Fi or specialized protocols; (III) visibility; (IV) transparency, \nbuilding an operating history and allowing problem solving based on \nreal data; (V) predictive capacity, adopting models based on algorithms \nthat correlate past operations with the measured real-time parameters; \n(VI) adaptability, allowing the system to adapt its operations. \nSmart Manufacturing strategic action lines are focused to reach im-\nprovements on autonomous interoperability, agility, flexibility, \ndecision-making, efficiency or cost reductions, mass customization, \nservitization [3,17–19]. It enables companies to cope with the chal-\nlenges of producing individualised products as expected by customers \nwith a short lead-time to market and at the cost of mass production [20]. \nSmart Manufacturing relies on the interdisciplinary and complex \nimplementation of several different technologies, such as \nCyber-Physical-Systems, Artificial Intelligence [21], Cloud Computing \n[22], Big Data analytics [23], Machine Learning [24], Internet of Things \n[25], Augmented Reality and Virtual Reality [26], etc. This paper will \nfocus on those selected to the implementation of a CPS architecture in a \ncomplex MAIT process in the space industry. 
However, a common \nstandard infrastructure is shared among all these technologies, helping \nto contextualize them in the overall product life-cycle value chain: the \nso-called RAMI 4.0 [27]. RAMI 4.0 ensures intercommunication and \nunderstanding across all business units and functions with a \nservice-oriented architecture, starting from physical things and arriving \nto the most digital business processes through a bi-dimensional hori-\nzontal and vertical expansion, following respectively the increase of \nvalue and the increase of authority, see Fig. 2. RAMI 4.0 well represents \nIndustry 4.0 concepts of holistic integration as well as easy interopera -\nbility, modularity and reconfigurability, bringing them directly in the \nstructure of the business, sometimes called enterprise, for its compre -\nhensive service-oriented goals. Being RAMI 4.0 such a complex archi -\ntecture, a hybrid model with the upper layers substituted with \ntraditional MES and/or ERP is under study to fasten its implementation \n[28]. Among all SM tools, the CPS has the best potential to reproduce \nthis framework, being the only one able to also integrate all other \ntechnologies. \n2.2. Cyber-physical systems applied to a manufacturing environment \nRecently, there has been an explosive growth in the development and \nimplementation of various Cyber-Physical Systems (CPS) [29]. CPS \n(cyber-physical systems) are physical systems that incorporate in-\ntegrations of computation, networking-communication, and physical \nprocesses control, see Fig. 3. They are made of heterogeneous cooper -\nating components interacting through a complex, coupled physical \nenvironment operating over many spatial and temporal scales [30]. \nEmbedded computers and networks monitor and control the physical \nprocesses, with feedback loops where physical processes affect compu -\ntations and vice-versa. 
CPS are defined as transformative technologies \nfor managing interconnected systems between their physical assets and \ncomputational capabilities [31]. CPS are systems of integrated compu -\ntational entities which are in intensive connection with the surrounding \nphysical world and its on-going processes, providing and using, at the \nsame time, data-accessing and data-processing services available on the \nInternet [32]. In other words, CPS can be generally characterized as \n‘‘physical and engineered systems whose operations are monitored, \ncontrolled, coordinated, and integrated by a computing and communi -\ncating core’’ [33]. To this end, CPSs are able to Ref. [34]: M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n279(i) collect data referred to themselves and their environment \n(ii) process and evaluate these data \n(iii) connect and communicate with other systems \n(iv) initiate actions. \nA CPS is defined as a system in which physical objects are required to \nbe accompanied by their representation in the digital world, to be in-\ntegrated with elements with computing, storage, and communication \ncapabilities, and to be networked between them. They are considered \none of the key technological innovations (Key Enabling Technology - \nKET) of the Fourth Industrial Revolution, a transformative technology \nthat can be placed in the foreground for the potential promised for the \ncreation of value along with the three dimensions of the digitalization of \nmanufacturing: the smart product, Smart Manufacturing, and changes in \nthe business models of companies [35]. Smart manufacturing systems \nuse CPS predominantly as a tool to monitor the physical world and make \ndecentralized decisions in the virtual world, often referring to Cyber-Physical Production Systems (CPPS). The growing availability, \naffordability and adaptability of sensors and connection systems are \nincreasing the widespread adoption of CPS and CPPS. 
Production data \nare easier to be collected and transferred to cloud platforms, where \nanalytics and AI tools permit to analyse and predict the production be-\nhaviours, and consequently act (manually or automatically) to increase \nperformance. A complete CPS should be able to get information from the \nphysical world and act on it, usually after data computations suggested \nthe action to be implemented. CPS should not be confused with IoT, \nbecause IoT is part of a CPS system, that for example could also include \nAI technology. Some insights on these technologies can be found in \nRef. [36]. Fig. 4 shows how CPPS connect a system in the physical world \nand its Digital Twin (in the cyber world), with an important remark \nabout the human-centred vision of these systems. Indeed, the oper-\nator/manager is always needed to check the process reliability and often \nto validate the analysis and the actuating decisions. In the design of a \nCPS it is recommended by Ref. [30] to pay attention to issues of \nFig. 2.RAMI 4.0 architecture is the most common standard framework for the application of Smart Manufacturing to a whole enterprise value chain. The archi -\ntecture is structured on a bi-directional and multi-layer way, with developments going both horizontally, following product life cycle value (procurement to sales) \nand hierarchical levels of complexity (product to connected world) and vertically, expanding from the simple asset (e.g. shop floor equipment) to the entire busi-\nness [27]. \nFig. 3.The figure shows a layout of the Cyber- \nPhysical System of a sensorized MAIT process plant. \nIt illustrates the cycle from physical to cyber domains, \npassing by control, communication and computation \nfunctions. In the computational layer, data records \nand analysis are performed. 
The Digital Twin re-\nproduces the process plant in the Cyber Domain, \nwhile the Internet of Things allows its communication \nwith the physical domain through the interconnection \nof sensors in an online platform. Eventually, intelli -\ngent analytics can be performed by AI algorithms \nintroduced in the computation phase and aimed at \nimproving the data reports, allowing faster decision- \nmaking, possibly made autonomously or semi- \nautonomously by the process machines themselves. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n280reliability and security, level of abstraction and architecture styles for \nmodular design and development, new frameworks and algorithms, \nconcepts of dependability, reconfigurability, certifiability and trust-\nworthiness. More research on this topic can be found in Refs. [37,38]. \n3.Approach: implementing a CPS architecture in space \nmanufacturing \nThe main problem a space factory nowadays faces is related to the \nhigh costs of keeping the pace of a competitive technological market, \nleading companies worldwide with the help of new business models to \nlower entry barriers to the segment. Technological innovation inte-\ngrating the newest IT solutions is requested to traditional manufacturing \nshop floors to leverage space long-term heritage while keeping the \nbusiness sustainable. The CPS was chosen among all SM tools, according \nto the features described in the previous chapter, as the best candidate to \ngive a measurable and reliable improvement to a space manufacturing \nprocess. Introducing a CPS into a space manufacturing facility requires a \ntwo-level approach: \n1. Monitoring the product to be manufactured. \n2. Monitoring the production, integration and test means necessary to \ndeliver the product. 
\nTo fully realize this approach, three main areas of technical \ncompetence have been considered: \n● Hardware technology, to identify the critical operations of a complex \nMAIT process; \n● Software technology, to identify the most performant solutions to \ndigitalize the process; \n● Sensor systems and Non-Destructive Inspection (NDI) techniques, to \nidentify types of sensors and related techniques to enhance product \nand process control and monitoring. \n3.1. Hardware technology \nIn this paragraph an overview of the applications of SM tools from \nthe point of view of Hardware Technology is given. First, the illustration \nof typical production systems will explain the convergence toward the \ncellular system. Then, traditional production characteristics in the space \nindustry will be mentioned and their evolution following SM principles \nfrom the point of view of HW technology will be presented. \nConcerning production systems, the aerospace industry is mainly \ncharacterized by intermittent production and the management of the production is typically based on job-shop criteria [39]. This system type \nis characterized by low volume and high variety with relatively low \nproduction rate and high flexibility. It is also noteworthy that the \nplanning, routing, and scheduling function is typically done for each \npart independently. The efficiency of the machines is low and, to reduce \ncost, they are general purpose machines. The machines and the move -\nments are reduced, and few setup operations are required. On the con-\ntrary, in large-scale productions machines are dedicated, and processing \nparameters are optimized for few types of parts. A continuous flow must \nbe maintained. In this case, high costs and highly specialized machines \nare affordable thanks to the large production volume. This type of \nproduction system is referred to as ‘process-based ’. 
The addressing of \nresources is completely dedicated to the optimization of specific pro-\ncesses and the routing of the single part reflects the sequence of the \noperations over the selected machine. As a result, the movements are \nmany and the mean lead time is affected. Between these two extremes, a \n‘combination layout ’ is usually proposed in industrial manufacturing. It \nis the so-called ‘cellular production ’ that requires a systematic approach \nin the design methodology that incorporates all the previous benefits \nand can easily move between the extremes, see Fig. 5 [39]. The benefits \nof the cellular production system are widely accepted in industrial \nproduction for the so-called mass customization, but many items must \nbe considered in the space industry. It is particularly important to \nmaintain the quality assurance of the fabricated components and it is \ndifficult to allow the automation of labour-intensive operations and \ncombinations between process options. \nTraditionally, space production systems, besides being of “job-shop ” \ntype, were mainly designed for single units. In Ref. [24] the example of \nBoeing is presented: the focus was on single unit delivery models and \nunique parts were supplied by customized contracts with suppliers \ncoming only from the space industry, with prototypes being qualified on \ndemand. Other traditional features included: (I) most of the documen -\ntation produced and archived in paper; (II) a low presence of automation \nor robotization; (III) single shift/5 days schedule; (IV) long life-cycle \nproducts of typically 10 years; (V) siloed structures for the different \ndepartments; (VI) “push ” approach with large stock of finite product \n[24,40–42]. Most of these characteristics evolved in the framework of \nIndustry 4.0 and Space 4.0 initiatives. 
The following interesting SM \nconcepts have been applied to HW technology, specifically in the \ncontext of small satellites ’ constellations [43]: \n● Automated Guidance Vehicles (AGVs) \nFig. 4.An example of the implementation of a cyber-physical system in the \nproduction department. The job flows from production orders to machines, \nwhile the decisions rise from machines back up to customer ’s orders. At every \nstage of data gathering and processing, human intervention is always necessary \nto provide advanced monitoring functions and interpreting results [91]. \nFig. 5.Types of production systems in terms of volume & variety and flexibility \n& efficiency. At the extremes, job-shop system qualifies as high variety and high \nflexibility and process-based system as high efficiency & high volume. The \nhybrid type cellular system lies in between. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n281Equipped with cameras and navigation software, these vehicles \nallow the transportation of heavy components or the final assembly \nthrough the factory. Well known in the automotive industry, this level of \nautomation was used by OneWeb facility in Florida. \n● Spring-based loading machines \nSpecific machines equipped with springs are used to load satellites to \navoid human non ergonomic operations. In general, flexibility of \norientation and vertical movement is required by satellite platforms to \nallow the last operations, when most subassemblies are completed and \nreaching parts is more difficult. \n● Additive Manufacturing (AM) \nAM is based on a layer-by-layer addition of material instead of \ntraditional machining ’s material-removing approach, thus allowing the \nrapid prototyping of even complex geometries thanks to advanced 3D \nsoftware design (for this reason, the technique is also called 3D printing \n[44]). A 3D printing machine was used by Telesat ’s facility in Ottawa, \nCanada, to realize the apertures of the phased-array antennas. 
This \nallowed the reduction of multiple part numbers into a single standard \none, besides a significant acceleration of times and reduction of costs. \nThe main limitation of the AM manufactured part is of comparable low \nstrength and associated quality, coupled with a high cost of the printing \nmachine system [45]. \n● Robots & cobots \nMultiple robotic solutions were applied for example by Telesat to \nmake repetitive and heavy operations easier, from manipulation of parts \nto cutting. However, these were used only to make prototypes, as the \nmass production is yet to come. The new frontier of robotization con-\ncerns “cobots ”: interconnected and easily programmable; autonomous, \nflexible, and collaborative; able to avoid collisions based on pre-set up \n360•visualizations of the environment; easily programmable [46]. An \nexample of learning cobots for painting can be found in Bombardier \n[47]. \n3.2. Software technology \nStarting from the traditional manufacturing data management sys-\ntems and passing by the concepts of interoperability, given by the In-\ndustrial Internet of Things (IIoT), and digitalization, given by the Digital \nTwin (DT), in this paragraph the most used CPS architecture will be \npresented. \nMany industries adopt the HMI-SCADA System, a comprehensive \nreal-time data control hardware and software architecture for \nManufacturing plants [48]. The Supervisory Control And Data Acquisi -\ntion system (SCADA) represents the overall control system, gathering \nand analysing data in real-time, while the Human Machine Interface \n(HMI) is the software showing data in a digestible format for humans \nthrough computing systems, allowing the interoperability of workers \nand machines. Interacting with equipment through user-friendly SW \ninterfaces, humans can reduce repetitive, unsafe, and heavy work or \nfacilitate their day-to-day process monitoring activities. 
The \nHMI-SCADA System architecture is based on executive functions and \ncommunicating functions. The executive functions are represented by \nthe field instrumentation (in-house instruments monitoring and con-\ntrolling automation processes) and Remote Terminal Units (RTU) or \nProgrammable Logic Controllers (PLC), whose concepts are mostly \noverlapped and represent the interface between plant equipment and \ntheir computing control units. The communication functions, on the \nother hand, are represented by a data communication layer, transferring \ndata from the plant to the server; a telemetry layer, transmitting and \nreceiving data from external sources (e.g. Earth telecommunication stations or satellite ground stations); the SCADA host or supervisory \nsystem, including the HMI software, representing the data receiving \nserver. In Fig. 6 the system is vertically contextualized as a level of the \noverall complex enterprise system standardized by the ISA95 model \n[49], including the device level at the bottom and the management \n(MOM or MES) and enterprise (ERP) interfaces at the top, the last two \nrepresenting the data analytics and integration platforms. Enterprise \nsystems (ES) or enterprise information systems (EIS) concepts have been \nresearched and utilized for decades, with applications in the aerospace \nindustry being studied at [50]. In some industries, like the pharmaceu -\ntical one, the application of this standard from day one allowed the \ntransition to paperless processes [51]. \nOver the next ten years, the number of connected devices will exceed \nthe number of inhabitants of the world [33]. The IIoT represents a \npossible evolution or integration of the HMI-SCADA System in the new \nindustrial landscape. The IIoT is defined as a network of physical sys-\ntems that can interact with each other thanks to standard communica -\ntion protocols, to achieve a common goal. 
Physical systems, and \ntherefore ’things ’, are represented by sensors, actuators, communication \nmodules and devices that can collaborate with each other, through \nintelligent components and applied software, and therefore achieve \nobjectives that strongly depend on their ability to transmit and process \ninformation. It is a multi-directional communication between processes, \nincluding the machinery used, the components and the products. The \nmain form of communication allowed by IIoT technology with respect to \nSCADA/HMI is machine-to-machine communication: the devices \ncommunicate directly using programmable electronic devices and \nwireless technologies. This form of interoperability among machines \ncould extensively contribute to the implementation of a CPS architec -\nture. Other recommendable IIoT characteristics are self-optimization, \nself-healing, self-configuration, and self-protection [52]. A use-case of \nIIoT-based architecture applied in aerospace manufacturing can be \nfound in Ref. [53]. \nTo implement a CPS, process physical entities must also have a \nfaithful representation in the digital world. This representation is \ndefined as ‘digital twin ’ (DT). DTs are commonly known as a key enabler \nfor the digital transformation in manufacturing. Different definitions \nagree on features such as (i) connectivity, i.e., the ability to communi -\ncate with other entities, (ii) autonomy, i.e., the possibility to live inde-\npendently from other entities, (iii) homogeneity, i.e., the capability, \nstrictly connected to autonomy, that allows using the same DT regard -\nless of the specific production environment, (iv) easiness of custom -\nization, i.e., the possibility to modify the behaviour of a physical entity \nby using the functionalities exposed by its DT, and (v) traceability, i.e., \nthe ability to trace the activity of the corresponding physical entity. 
To \nallow traceability, systems based on barcodes, QR codes or RFIDs [54] \nare applied or incorporated in the product. Finally, DTs monitor and \ncontrol the physical entities, where physical entities send data to update \nwhat are commonly referred to as the virtual models [55,56]. Many are \nthe advantages of this concept. First, it is easily usable for small series \nof customized products. Secondly, the DT allows modular simulation: \nbeing able to reproduce the operating system, it allows to modify \nproducts in a flexible way and to speed up innovation processes. The \npossibility of minimizing the time between design and product delivery \nthrough a DT is a good alternative not to change the process itself, which \nis often more complicated and more expensive. What a DT facilitates \nthat other technologies are not able to is the real time reproduction of \nthe system. Real time is a key concept in process monitoring, as the \nevolution of industrial trends follows speed, with dynamic systems \nhandling high volumes of data [57], also thanks to the introduction of \nnew semiconductor materials which can speed up electronic connections \nof process equipment and information systems. A challenge to consider, \nespecially when scaling the concept to a whole process, is the risk to \ndesign closed cycles, with monitoring functions heavily dependent on the \ndigital reproduction itself. Simulation models made of DTs are able to \nembrace the entire value chain and the entire life cycle of the products, M. Eugeni et al. 
\nActa Astronautica 192 (2022) 276–290\n282thus providing the necessary parameters not only to make fast and \nshort-term decisions, but also to allow more sustainable decisions in the \nlong-term, using the permanent collection of data through historical \nseries, which become rich material for statistical models to build more \naccurate correlation coefficients and to show more complete predictive \ngraphical instruments for trends ’ interpretation. \nThe integration of the SCADA/HMI level with the machine-to- \nmachine communication characteristic of the IIoT, linked to a 3D real- time throughout the process virtual representation of all sensors and \nmachinery using DTs, allows the implementation of a fully compre -\nhensive CPS architecture. The so-called CPS 5C level architecture [58] \nclearly defines, through sequential activity flows, the architecture of a \nCPS starting from the initial data acquisition, up to the creation of final \nvalue. The architecture is characterized by five levels: 1. Smart \nConnection Level: guarantees the timely and reliable acquisition of data \nfrom sensors, controllers or company production systems (e.g. ERP, \nFig. 6.Pyramidal architecture of an overall enterprise SM standards ’ system, showing the incorporation of HMI/SCADA level [53]. \nFig. 7.CPS 5-levels architecture is the most used. Levels of connection, conversion, cyber, cognition and configuration are shown. Related assets, users, and functions \nare displayed. [58]. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n283MOM, MES). It is central, considering the heterogeneity of the data, to \nselect appropriate data acquisition methods and sensors (in terms of \ntypes and specifications). 2. Data-to-Information Conversion Level: \nconverts the data collected into significant information through specific \nalgorithms and analysis. 3. 
Cyber-Level: acts as a central hub, where all \nthe information deriving from the various machines and components, \narrives and creates an intelligent network. They are then analysed to \nunderstand specific or collective information about the state of the \nsystem and evaluated to predict future events. 4. Cognition-Level: the \nimplementation of the CPS at this level generates an in-depth knowledge \nof the monitored system, a valuable support in the decision-making \nprocess. This knowledge allows operators to manage the system opti-\nmally. To ensure visibility, clarity, and immediacy in the understanding \nof the system by the operators, it is often necessary to implement graphic \nanalysis and representations. 5. Configuration-Level: the configuration \nlevel constitutes the feedback of the cyberspace in the physical space and \nacts as a supervisory control to make the machines self-configuring and \nself-adaptable. It acts as a resilience control system (RCS) and allows to \nmonitor, prevent, and correct the systems. This 5-level framework is by \nfar the main reference for CPS. Fig. 7 represents its levels and functions. \nFuture developments of CPS use the 5G protocol network, aiming to low \nlatency (ms) and high data rates (Gbps) [59]. With 6G, already started to \nbe studied, even Tbps could be reached [60]. However, it is estimated \nthat 6G will not be implemented until 2030 [61]. \n3.3. Sensor systems and non-destructive inspection techniques \nSensor systems and Non-Destructive Inspection techniques are \nreviewed as suitable to be integrated in the framework of a CPS to \nimprove the process by faster and more qualitative structural health \nmonitoring. Specific references to advantages in the space industry are \nmade. \n3.3.1. Fiber optics sensors \nThe principle of fiber optics sensors is that of an input light reflected \non a fiber and showing an interference pattern passing by a light de-\ntector. 
Fiber optics sensors can measure all traditional sensed parame -\nters in structural health monitoring (e.g. strain, temperature, crack \npropagation, leakage, corrosion). Fiber optic sensors have numerous \nadvantages for application in aerospace. In fact, they are lightweight, \ncan be easily embedded into composite structures and are immune to \nelectromagnetic interference. Furthermore, considering that a huge \nnumber of sensors will be necessary to completely cover the structural \nelements of a space structure, the multiplexing capability of optical fi-\nbers, that is the possibility of writing several sensors into one single \nfiber, results in a notable advantage both in terms of low complexity and \nlow weight. The fact that optical fibers do not involve any electric signal \nis a clear advantage from a safety point of view. Other advantages are \nthe long-term stability, low signal losses, the ability to operate in a wide \nrange of temperatures. Drawbacks using these sensors are the difficulty \nin replacing or repairing the fiber if it fails and some technological dif-\nficulties at cryogenic conditions such as low response time for hydrogen \nsensing and low sensitivity for temperature measurements. One of the \nmost interesting applications for small satellite manufacturing is that it \nis possible to monitor the degree of cure by simply measuring the \nrefractive index changes in isothermal conditions [62]. Optical fibers \ncan also be employed for chemical sensing during the cure of composite \nmaterials. At NASA-Langley chemical spectra were obtained using single \nmode optical fibers [63]. \n3.3.2. Acoustic emission sensors \nAcoustic emission (AE) sensors resort to the analysis of emissions \nfrom active defects and are sensitive to defect activity when a structure \nis loaded either during service or a proof test. AE analysis is a useful \nmethod for the investigation of local damage in materials. 
It is also \npossible to observe damage processes during the entire load history without any disturbance to the specimen. Acoustic emission sensors are \nused for monitoring a wide number of defects in materials such as dy-\nnamic strain, crack growth, leakage, corrosion, delamination, fiber \nbreakage. They are particularly suitable for monitoring the material \nfatigue behaviour since dynamic strain is measured. Conventional \ntechnologies used for AE monitoring are piezoelectric transducers, but \nfiber optic-based AE sensing technology is gaining more and more \nconsideration for the already mentioned advantages related to the use of \noptical fibers [64]. In-flight AE sensors have been successfully demon -\nstrated on the DC-XA in-flight experimentation vehicle. The AE moni -\ntoring system was conceived to have information on temperature limits, \nvibrations, noise characterization and to provide in-flight data from the \nLH2 cryogenic tank. The control unit AEFIS [65] (Acoustic Emission \nFlight Instrumentation System) was able to monitor and send informa -\ntion to the on-board computer for real-time monitoring. The system was \nalso conceived for active monitoring through excitation of the acoustic \nemission sensors. A health monitoring system with 48 sensors for strain \nand hydrogen monitoring was used on the composite hydrogen tank for \nthe X-33 experimental vehicle during on-ground tests. In addition, AE \nsensors for high temperatures have been developed for the structural \nmonitoring of the nose TPS on the X-38, now cancelled. Acoustic \nemission sensors have also been successfully applied during static tests \nof the X34 composite wing. \n3.3.3. Piezoelectric materials \nPiezoelectric materials [79] are composite materials with incorpo -\nrated electrical connections. Under the application of stress, their elec-\ntrodes are excited and the material is charged. Moreover, it manifests a \nlinear change in shape. 
Charge and linear change represent the char-\nacteristics of such materials in terms of their dual use as actuators \n(transforming electrical energy into mechanical energy) and sensors \n(detecting possible defects measuring structural variations). The most \ncommon family of piezoelectric materials is the so-called PZT (zirconate \ntitanate family). Used as actuators, PZTs sensors can actively monitor \nthe structure. Functioning as both transmitters and receivers, they can \nbe part of a flexible structural health monitoring system capable of \nperforming several evaluation functions. Presently, they can be used for \nactive damage detection with high-frequency electro-mechanical \nimpedance method, or active damage detection with the pulse-echo and \npitch-catch techniques using Lamb-waves, or as passive sensors for \nlow-impact damage detection and acoustic emission detection [66]. \nFurthermore, they can be used in a phased array of sensors that allows, \nthrough the superposition of the generated waves, to focus or steer the \nbeam in a specific direction. Several studies have demonstrated the \ncapability of piezoelectric sensors for damage detection in composite \nmaterials. Studies at ONERA have demonstrated that Lamb waves are \nsensitive to debonding caused by low impact in a sandwich structure \n[67]. \n3.3.4. Micro-Electromechanical Systems (MEMS) \nMicro-Electromechanical Systems (MEMS) are thin-film devices \nproduced through photolithography and chemical etching. Sensors for \ntemperature and pressure measurements are already available as com-\nmercial off-the-shelf products, but other MEMS sensors exist such as \naccelerometers, gyros, acoustic emission, and chemical sensors. The \nadvantage of using MEMS sensors is their small size and potentially low \ncost. They can be easily embedded or surface bonded. 
Furthermore, with \nan ASIC (Application Specific Integrated Circuit) technology, it is \npossible to create a microsystem of different sensors in one single chip \n[68]. Some issues for structural health monitoring of aerospace struc-\ntures are the temperature range that goes at best from −50 °C to \n175 °C. Furthermore, the temperature dependency of some sensors \nmay affect the measurements, thus limiting the performance [69]. \nDevelopment is required to attain space qualification, and most of all, \nthese devices should be tested in real environment conditions. Another \nissue is the packaging of MEMS sensors. As an example, a smart layer M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n284composed of PZT sensors developed by Acellent Tech. Inc. has been \nembedded into a composite laminate that was also equipped with an \nelectromagnetic layer to measure electrical resistance properties. \n3.3.5. Self-monitoring materials \nSome structural materials can be used as self-monitoring materials, \nwhich means they can sense their own strain and damage by measuring \ntheir electrical resistance [70]. Carbon fiber-reinforced polymers are \nvery suitable as self-monitoring materials since the fibers are electrically \nconducting and the electrical properties of the material are sensitive to \ndamage. Self-monitoring materials are intrinsically smart, which means \nthey don’t need embedded or attached sensors, so they have some ad-\nvantages like low cost, simple design, great durability, large sensing \nvolume and absence of mechanical property degradation due to \nembedding of sensors. A problem of concern for electrical measurements \nis the electrostatic disturbance due to the electrical charging of the \nstructure when flying through charged atmosphere at high speeds or in \norbit due to encounter with ionized molecules. Another issue to be \naddressed is the ability to locate the damage in large composite struc-\ntures. 
As an alternative, CFRP self-healing materials are under study \n[71], having the advantage of using a new ISOX (iso-\ncyanurate-oxazolidone) thermosetting matrix able to restructure itself in \ncase of delamination or debonding (fiber breakages are not detectable \nthough). \n3.3.6. Thermocouples, strain gauges and accelerometers \nTogether with new sensing technologies such as optical fibers, pie-\nzoelectrics and so on, conventional sensors are also used for structural \nhealth monitoring in the space industry. The major issues for sensors \nsuch as thermocouples, strain gauges and accelerometers are the weight \npenalty from the sensor itself, but also from the wires required to pro-\nvide power and data communication. Wireless transceivers can be used \nto overcome this penalty. These have been flight tested at NASA in the \nframe of ARIES experiment as part of an Integrated Vehicle Health \nMonitoring architecture [72]. The transceivers radio frequency emis-\nsions have been demonstrated to not have interference with communi -\ncation and navigation antennas. \n3.3.7. Thermography \nThermographic methods are non-destructive inspection methods in \nwhich the presence of flaws is determined by monitoring the flow of heat \nover the surface of a structure after some external introduction of a \ntemperature gradient [73]. The presence of flaws disrupts the normal \npattern of heat flow that would be expected in a sound structure. The \nmethod is more sensitive to flaws near the surface. Modern thermo -\ngraphic systems commonly use infrared (IR) cameras to detect radiated \nheat and are controlled by TV video electronics which sample the field of \nview at a typical rate of 50 Hz, allowing temperature variations on a 20 \nms timescale to be resolved [74]. 
The camera is sensitive to temperature \nchanges of about 0.005 °C and covers a chosen range of temperature, \n4 °C and 8 °C being commonly suitable, although operation is feasible \nbetween −50 °C and 100 °C. Liquid crystal coatings and pyroelectric \ndetectors have also been used to detect IR radiation. Thermographic \nmethods fall broadly into two groups: active methods, and passive \nmethods. Active methods are those in which the thermal gradient is \nproduced and continuously maintained by the application of cyclic \nstress. An interesting application of IR thermographic technique is the \ninstallation of a thermo-camera on an unmanned aerial vehicle for the \nmonitoring of defects at the distance of 2 m and 6 m [75]. Passive \nmethods are those in which the thermal gradient results from a transient \nchange. Passive methods are the most widely applied NDI techniques in \ncomposites inspection. Also, non-IR conductive thermography has been \napplied to aerospace applications, such as in the field of Maintenance, \nRepair and Overhaul (MRO), being able to identify defects in a laminate \ncomposite at low temperature [76]. 3.3.8. Ultrasonic testing \nUltrasonic testing (UT) is the most widely used non-destructive in-\nspection method for the examination of composites [77]. On micro -\nscopically homogeneous materials (i.e. non-composite) it is commonly \nused in the frequency range 20 kHz to 20 MHz. With composite mate-\nrials the testing range is significantly reduced because of the increased \nattenuation, so the operating frequency limit is usually 5 MHz or less. \nHowever, the ability to resolve small flaws will also be reduced. In most \ntechniques, short pulses of ultrasound (typically a few microseconds) are \npassed into the composite material and detected after having interro -\ngated the structure. The techniques include pulse-echo, through-\ntransmission, back-scattering, acoustic-ultrasonics, and ultrasonic \nspectroscopy. 
In these methods, it is important to avoid frequencies at \nwhich resonance occurs between ply interfaces. For unidirectional plies \nspaced at 8 plies/mm this frequency is usually about 12 MHz. There may \nbe an additional resonance for woven fabrics at approximately 6 MHz for \n0.25 mm plies, although resonance at other frequencies has been seen in \npractice. Different approaches can be used: manual, immersion, and \nlaser testing. Moreover, an example of combination of UT and conven -\ntional IR thermography techniques is presented in Ref. [78], using car-\nbon/epoxy patches bonded on an aluminium plate and producing fusion \nalgorithms correlating both inspection results. \nSensors and NDI techniques above mentioned can be object of trade- \noff analyses to improve space manufacturing processes according to \ncustomers ’ requests, mainly to increase factories ’ KPIs like Quality of \nService (QoS) and defects rate. In the following chapter the case study of \na real space manufacturing process includes the assessment on the use of \nsome of these technologies. \n4.Case study: RUAG ’s composite sandwich panel manufacturing \nAs a case study, RUAG ’s composite sandwich panel manufacturing \nprocess was taken into consideration. Panel manufacturing today is still a \nlargely manual process. This is especially valid for large, non-serial \nspacecrafts for scientific missions. With the establishment of constella -\ntions during the last years, considerable effort was made to industrialise \nthe overall manufacturing process. Still, the state-of-the-art \nmanufacturing process is distant from an Industry 4.0 philosophy. 
\nHere follow the main process areas (according to the job-shop pro-\nduction system), each of which is made of stations, operations and \nphases: \n- Parts preparation: procured and stored parts (cut-to-shape \naluminium face sheets, already expanded aluminium honeycomb \ncore, adhesives, foams, inserts and heat pipes) are machined and pre- \nassembled. Parts whose surface is destined to external exposure are \ntreated under galvanic bath to prevent corrosion. \n- Panel assembly: the pre-assembly is bonded under hot press. \n- Panel inspection and testing: Non-Destructive Inspections (NDI) \ntechniques (e.g., Ultrasonic Inspection - UT) and testing (e.g., flat-\nwise tensile strength) are performed. \n- Panel equipment: hot-bonded inserts are automatically potted, and \ncold-bonded inserts are machined; thermal equipment (e.g., paint \nand heat pipes) is integrated. \nFor a summary of existing sensors or automated equipment deliv-\nering process data, with related measurement properties and units the \nreader can refer to Table 1 \nThe general approach to industrial panel manufacturing varies at \ndifferent points compared to the more traditional solution. With \nindustrial-based manufacturing, materials and processes are tailored to \nthe product itself. In the case of the sandwich panels, this means that \nface sheets are already procured cut to shape. Furthermore, time- \nconsuming processes are being automized, as for instance the bonding \nof inserts. A two-level approach has been considered to improve the \nprocess as shown in Fig. 9. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n285First, existing data must be collected, categorized, and interpreted, so \nthat bottlenecks or shortcomings can be more easily identified. Different \ntypes of sensors and non-destructive sensing techniques were identified, \nsee Fig. 10(a) and Fig. 10(b) showing how a sensing network is deployed \nover the observed process. 
The collection and categorization of data will \nbe possible thanks to the improvement of the built-in traceability sys-\ntem, extending it to the whole process and introducing an IoT infra-\nstructure based on a sensors network and a data processing and analytics \nplatform. Process parameters like pressure, humidity and temperature \nare tracked, as well as product part numbers; optical imagery aids \nquality control, sound alarms on thresholds helps day-by-day opera -\ntions; all these functions (and more programmable ones according to \nproduction needs) are so interconnected and easily monitorable by a \ndashboard by means the software architecture shown in Fig. 10 (c)). \nObservations are used to perform an AS-IS analysis about data \ncollection within the case process. The Acatech maturity model is chosen \nas a foundation for the development of a new assessment model to \nrepresent the current smartness level of the process. The new assessment \nmodel is based both on the evaluation of single activities, which is \ncrucial to thoroughly verify every operating step of the process, and on \nthe assessment of the whole process, which allows to identify transversal \nintegration elements, which would be otherwise scarcely visible. The \nfirst assessment focuses on the single activities. This process is based \nupon a customization of the Acatech model, which ensures a digital \nmaturity level assessment comprising six maturity stages: from a not- \ndigitalized company to a company with all the features of Industry \n4.0. This model was adjusted to the objective of the assessment, i.e. to \nmeasure the smartness of the process in terms of data collection, and to \nassess single activities. In particular, a qualitative assessment was per-\nformed, and the achievement of a smart level was evaluated according to \nthe maturity model ’s features of computerization, connectivity, visibil -\nity, transparency, predictive capacity and adaptability, see Sec. 
2.1 and \nFig. 8. \nAn analysis of possible gatherings of new useful information by new \ntechnologies or new stations can be conducted once the already avail-\nable are collected and analysed by means a suitable software and \ncomputing infrastructure. In case the interpretation of data executed at \nstep 1 needed a deeper insight or critical points in the process were \nidentified, some new technologies should be added accordingly. One of Table 1 \nAs-is process: sensors and automated equipment with related measurement pa-\nrameters and units. © RUAG Space. \nPROCESS \nSTATION/ \nOPERATION/ \nPHASE SENSOR/ \nEQUIPMENT MEASUREMENT \nPROPERTY MEASUREMENT \nUNIT \nParts preparation/ \nPanel milling Laser External dimensions \n(lenght, width, pocket \npositions) Mm \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Timer Time of bath S \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition pH \nParts preparation/ \nFacesheet \nbonding surface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition Concentration \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Timer Time of bath S \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition pH \nParts preparation/ \nInsert bonding \nsurface \npreparation/ \nGalvanic bath Sensor Chemical \ncomposition Concentration \nParts preparation/ \nAdhesives/ \nIncoming \ninspection Tensile \ntesting \nmachine Lap shear strenght Mpa \nParts preparation/ \nAdhesives/ \nStoring Timer Storage time S \nSandwich \nassembly/ \nSandwich layup Laser Alignment Mm \nSandwich \nassembly/Panel \nbonding Hot press Pressure Bar \nSandwich \nassembly/Panel \nbonding Hot press Temperature •C \nSandwich \nassembly/Panel \nbonding Hot press Time S \nPanel inspection \nand testing/ \nUltrasonic \ninspection Sensor Panel defects \n(delamination, \ninhomogeneity, \nbonding defects, 
etc.) dB \nPanel inspection \nand testing/ \nFlatwise tensile \ntest Tensile \ntesting \nmachine Tensile strenght Mpa \nPanel inspection \nand testing/3- \npoint and 4- \npoint bending \ntest Tensile \ntesting \nmachine Bending strenght Mpa \nPanel inspection \nand testing/ \nThermal cycling Thermal \nchamber Outgassing % \nPanel equipment/ \nInsert potting APM Insert-injected \nadhesive mass G Table 1 (continued ) \nPROCESS \nSTATION/ \nOPERATION/ \nPHASE SENSOR/ \nEQUIPMENT MEASUREMENT \nPROPERTY MEASUREMENT \nUNIT \nPanel equipment/ \nInsert potting APM Adhesive mixing ratio % \nPanel equipment/ \nInsert potting APM Insert height w.r.t. \nfacesheet Mm \nPanel equipment/ \nInsert potting APM Insert angle w.r.t. \nfacesheet Rad \nPanel equipment/ \nInsert potting APM Insert position Mm \nPanel equipment/ \nAdhesive curing Sensor oven Curing temperature •C \nPanel equipment/ \nAdhesive curing Sensor oven Curing time S \nPanel equipment/ \nInsert proof-load \ntest Sensor Load-displacement \ndiagram N/mm \nPanel equipment/ \nInsert pull-out \ntest Sensor Pull-out load N \nPanel equipment/ \nHeater bonding Laser Position Mm \nPanel equipment/ \nMLI bonding Testing \nmachine Bonding strenght Mpa \nPanel equipment/ \nTie-base bonding Testing \nmachine Bonding strenght Mpa M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n286the main process phases to concentrate on to add information is quality \ntesting. Quality testing usually requires long times and heavily impacts \nboth the technical and economic aspects of the process. Making it more \nagile and automating it would fasten the process and make the testing \nitself more accurate thanks to incorporated statistical models. In our \ncase study, testing stations J4 and J5 right after panel bonding (process \nstep 11) and J2 just after panel machining and inserts installation by \nmeans of RUAG ’s fully automated APM technology (process step 13) is \nof particular interest for future developments. 
\nThe method proposed to assess the AS-IS process status will be \napplied to understand the steps required to reach the desired “smart ” \nlevel, in terms of individual activities, and to understand how to generate a greater level of interconnection and be able to monitor a \ngreater number of performances. If, for instance, the intention is to \nguarantee that the available data generates an “Enterprise ” level of \ninterconnection throughout the entire process, see Fig. 6, it would be \nnecessary to guarantee a circulation of data that goes beyond the com-\npany ’s internal borders, in an extensive and transversal manner between \nthe various constituent areas. The aim of this study ’s CPS is to reach the \nManufacturing Operations Management (MOM) or Manufacturing \nExecution System (MES) level. However, its inherent feature of scal-\nability allows the extension from the single process to the overall factory \nto the overall plant. \nAn example of process improvement through the application of the \nFig. 8.An example of process performance assessment using the AS-IS model. \nFig. 9.The study ’s approach has two levels: data collection and interpretation, aimed to gather data from the process, and CPS architecture and implementation, to \ndigitalize the existing data and possibly add new information. Measurement of KPIs is then applied to both industrial and digital aspects of the study to verify \nimprovements. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n287CPS model was realized through a preliminary simulation of raw data \ncoming from the Automated Potting Machine and connected to the \nsoftware architecture described in Fig. 11. First of all, the APM data \n(represented by a list of measurements and their timestamp) is included \nin a database. Once the database is collected, data is normalized and aggregated online according to the different timeframes and stored in \nthe data lake. 
In batches, such data is clustered and displayed in a \ndashboard. The data collection and visualization allow the monitoring, \ncontrol, and use of data analysis to detect process deviations for example \nto stop the line or alert operators. A possible dashboard and an example \nFig. 10.RUAG ’s sandwich composite panel manufacturing process is shown before (a) and after (b) the integration of existing sensors with an IoT network com-\nmanded by a computing infrastructure. Sensors measure temperature, pressure and humidity and scan panel ’s surface through optical and laser systems. Traceability \nis also performed through barcodes. The whole process is included in a tree-shaped system. The computing infrastructure is then represented in detail (c). Online \nprocessing of sensors ’ data inputs is performed through actions including preprocessing, normalization, thresholds ’ check, and monitoring. Processed data is then \nstored in a data lake, where users are able to have continuous open access, while data are interpreted by a statistical model-based closed-loop of KPIs ’ prediction and \nforecast and are displayed through a user-friendly visual dashboard. Some of the many SW platforms available in the market to realize such concept are mentioned \n[81–86,88–90,92]. \nFig. 11.The CPS∕layers as a flux of data from input \nto output. In the first layer data from interconnected \nsensors (IoT) is simulated or collected from historical \narchives, so that the process is reconstructed (DT). In \nthe cyber layer, i.e. the core of the CPS, data collec -\ntion, storage and analytics is done with the help of \nstatistical predictive models, allowing data correla -\ntion (AI). In the final layer, data can be visualized \nthrough reports and insights and interpreted with \nhuman touch, allowing to understand causation \neffects. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n288of graphs displayable as output are shown respectively in Fig. 12 and \nFig. 13. 
\nThe approach was extended to the whole process thanks to its layout \nreconstruction in the cyber space, see Fig. 10. In Fig. 14 a representation \nof the Sandwich Panel Manufacturing process using BPMN and a simu-\nlation through Bizagi Modeler allows the performance of a «what if » \nanalysis. This tool is useful to investigate costs and times needed to \nexecute the entire process. A top-down approach was applied: starting \nfrom a model of the macro-tasks, and then defining each task following \nthe most left representation to define each block as an independent \nprocess. This allows a detailed analysis, gaining a more realistic repre -\nsentation on the timing of the macro-block. Finally, the model reaches \nautomation level and is upgraded with a Markov-chain-based AI algo-\nrithm able to show probabilities of failure for sample properties of in-\nterest. The system upgrade can be categorized in three levels: \n1. Level 1 – “Process monitoring ” \nThis level is characterized by the ability of the CPS to process the \ncollected data automatically generating reports and sending alarms, \nbased on inputs pregiven manually. In case of failures being signalized, \nthe information provided allows the operators and/or process engineers \nto intervene and adjust the process parameters to address the issue. \nReports can assist in the identification of trends by displaying data over \na longer period. \n2. Level 2 – “Small-scale process control ” \nAt this level, further analysis and interpretation is performed auto-\nmatically by the AI algorithms to predict the outcome of the process. For \ninstance, the CPS can stop and restart the potting process with a new \ninsert if the probability of negative process outcome is high. Based on \nidentified trends, the CPS can signal potential failures before they occur. \nHowever, the system is incapable of adjusting any of the process pa-\nrameters to keep the process running and avoid the identified threats. \n3. 
Level 3 – “Large-scale process control ” \nAt level 3, the AI-assisted CPS can optimize the process parameters to \nachieve optimal process result – delivering the right product quality in \nthe shortest production time. It can perform continuous predictive analysis on all production system components using the data fed in real- \ntime by the sensor network. Based on the performance forecast, the CPS \ncan predict the completion time for each panel, tool exchange rates, and \nequipment maintenance intervals, thereby being able plan the entire \nmaterial flow through the station. At this stage, multiple production \nstations can be interconnected using the same CPS. \nTo reach these levels, capital investment in upgrading the production \nsystem is necessary. Table 2 shows estimated investment figures needed \nto support the CPS implementation. \n5.Conclusions \nThe paper contextualized Smart Manufacturing technologies in the \nfast-evolving market of large constellations of small satellites and \nrelated new production paradigms. A review of fundamental theoretical \nconcepts behind Industry 4.0 disruptive change was presented, focused \non Cyber-Physical Systems and their 5C-level standard architecture. \nPossible Smart Manufacturing solutions, in terms of hardware and \nsoftware technologies, were reviewed to contribute to a future signifi -\ncant improvement and optimization of a whole MAIT cycle. CPS, DT and \nIoT were selected as the most promising technologies to be adopted and \nRUAG ’s composite sandwich panel manufacturing process was taken as \ncase study. The process was reconstructed so that each sensor could be \nsimulated in the cyber space as a flux of data. In parallel, an assessment \nof the SM level of the process according to the Acatech maturity model \nwas carried on unlocking the process improvement potential. 
The flux of \ndata flowing from the sensing layer into the cyber layer of the CPS \nthrough an interconnected IoT network is represented by unit blocks \nrelated to each process step. The use of AI upgrades the model, giving it \nthe ability to also reach some level of process control and optimization. \nThree different levels of process improvement are identified each of \nwhich is linked to its economic estimation of the necessary computing \ninfrastructure. By this model equipment data can be interpreted through \nits pre-processing, normalization, storage and distribution to a user- \nfriendly visual dashboard, according to a new logical analysis of the \nindustrial process, delivering the final improvement, represented by the \nopportunity of reconfiguring the production line to reach the goals \nmeasured by traditional Key Performance Indicators (KPIs), among \nwhich panel production rate and Overall Equipment Efficiency (OEE), \nand optimize specific parameters related to SM, such as process agility \nand flexibility and the CPS scalability. \nFig. 12.An example of the CPS dashboard. M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n289Declaration of competing interest \nThe authors declare that they have no known competing financial \ninterests or personal relationships that could have appeared to influence \nthe work reported in this paper. \nAcknowledgment \nThe present paper results from the project “Smart Manufacturing for \nfuture constellations ” funded by the European Space Agency (ESA ITT \nAO/1 –10002/19/NL/AR for technology development) and developed in \ncollaboration by Sapienza University of Rome, Thales Alenia Space Italy \nand RUAG Space. \nReferences \n[1]M. Blanchet, THINK ACT. INDUSTRY 4.0. The New Industrial Revolution. How \nEurope Will Succeed, Roland Berger, March 2014 . [2]E. S. Agency, What Is Space 4.0? [Online]. Available, November 2021. November \n2021, https://www.esa.int/About_Us/Ministerial_Council_2016/What_is_space_4. 
\n0. \n[3]R.Y. Zhong, X. Xua, E. Klotz, S.T. Newmanc, Intelligent manufacturing in the \ncontext of industry 4.0: a review, Engineering 3 (2017) 616–630. \n[4]C. Daehnick, I. Klinghoffer, B. Maritz, B. Wiseman, “Large LEO Satellite \nConstellations: Will it Be Different This Time?, ” McKinsey &Co, Aerospace and \nDefence Practice, May 2020 . \n[5]UK saves OneWeb, Spaceflight 62 (September) (2020) . \n[6]J. Hou, Y. Zhao, Y. Zhou, X. Du and Z. Li, “The creative application of DIY \nmanufacturing technology in remote sensing satellite, ” Aero. China. Vol. 17. N.2, \nSummer 2016. \n[7]K. Jackson, K. Efthymioua, J. Borton, “Digital Manufacturing and Flexible \nAssembly Technologies for Reconfigurable Aerospace Production Systems, ” \nChangeable, Agile, Reconfigurable & Virtual Production Conference, 2016 . \n[8]A. Kusiak, Smart manufacturing, Int. J. Prod. Res. 56 (2018) 508–517. \n[9]S. Marigonda, “Smart Manufacturing: sfide e opportunit ˇa.,” Digital Tools 4.0. \n[10] L. Li, China ’s manufacturing locus in 2025: with a comparison of “Made-in-China \n2025 ” and “Industry 4.0, Technol. Forecast. Soc. Change 135 (2018) 66–74. \n[11] L.D. Xu, Industry 4.0: state of the art and future trends, Int. J. Prod. Res. 56 (8) \n(2018) . \n[12] C. Bryson, Heritage and Satellite Manufacturing: Firm-Level Competitiveness and \nthe Management of Risk in Global Production Networks, Economic Geography, \n2019, pp. 423–441. \n[13] C. Salkin, M. Oner, A. Ustundag, E. Cevikcan, A Conceptual Framework for \nIndustry 4.0, 2018 . \n[14] K. Nakamoto, K. Shirase, Simulation technologies for the development of an \nautonomous and intelligent machine tool, Int. J. Autom. Technol. (2013), https:// \ndoi.org/10.20965/ijat.2013.p0006 . \n[15] K.D. Thoben, S. Wiesner, T. Wuest, Industrie 4.0’ and smart manufacturing – a \nreview of research issues and application examples, Int. J. Autom. Technol. 11 (1) \n(January 2017) 4–16. \n[16] G.G. Schuh, Industrie 4.0 Maturity Index. 
Managing the Digital Transformation of \nCompanies [Online]. Available:, 2017. February 2021, https://hal.archives-ouver \ntes.fr/hal-02455705 . \n[17] V. Cruz-Machado, Scanning the industry 4.0: a literature review on technologies \nfor manufacturing systems, Engineering Science and Technology, an International \nJournal 22 (3) (June 2019) 899–919. \n[18] D.P. Perales, F.A. Valero, A.B. García, Industry 4.0, A Classification Scheme, 2018 . \n[19] O. Cardin, Classification of cyber-physical production systems applications: \nproposition of an analysis framework, Comput. Ind. 104 (January 2019) 11–21, \nhttps://doi.org/10.1016/j.compind.2018.10.002 . \n[20] A. Rojko, Industry 4.0 concept: background and overview, International Journal of \nInteractive Mobile Technologies 11 (5) (2017) . \nFig. 13.Examples of graphs showable by the dashboard: the first graph represents the single operation ’s timing vs time, the second one the production efficiency vs \ntime and the last one a map of discarded APM inserts for adhesive quantity. Scales are not shown for confidential reasons. \nFig. 14.The process layout represented in the cyber space and its focus at APM. \nTable 2 \nProduction volume requirements - rough order of magnitude estimates. \nCPS \nUpgrade \nLevel Level \nDescription Estimated \nMachine \nProcurement \nCost Increase \n[%] Estimated CPS \nImplementation \nand Operation \nCost [EUR] Minimum \nProduction \nVolume \n[inserts] \nLevel 1 Process \nmonitoring 3–5 42∕000 €/5 years 20.000 \nLevel 2 Small-scale \nprocess \ncontrol 10–15 55∕500 €/5 years 200.000 \nLevel 3 Large-scale \nprocess \ncontrol 40–60 82∕500 €/5 years 1.000.000 M. Eugeni et al. \nActa Astronautica 192 (2022) 276–290\n290[21] B.-h. Li, H. Bao-cun, L. Xiao-bing, Y. Chun-wei, Y. Wen-tao, Applications of \nartificial intelligence in intelligent manufacturing: a review, Frontiers of \nInformation Technology & Electronic Engineering 18 (1) (2017) 86–96. \n[22] J. Jadaan, K.S. 
Siderska, Cloud manufacturing: a service-oriented manufacturing, \nEngineering Management in Production and Services 10 (1) (2018) 22–31. \n[23] N. Khan, I. Yaqoob, I. Abaker, T. Hashem, Z. Inayat, W. Kamaleldin, A. Mahmoud, \nM. Alam, M. Shiraz, A. Gani, Big Data: Survey, Technologies, Opportunities, and \nChallenges, ” The Scientific World Journal, July 2014 . \n[24] C. Duke, G. Sadlier, D. Herr, Industry 4.0 and the Future of UK Space, ” London \nEconomics, 2019 . \n[25] E. Sisinni, A. Saifullah, S. Han, U. Jennehag, M. Gidlung, Industrial internet of \nthings: challenges, opportunities, and directions, IEEE Trans. Ind. Inf. 10 (10) \n(2018) . \n[26] H. Li, Application research of virtual reality and augmented reality, Advances in \nIntelligent Systems and Computing 1233 (2021) 494–499. \n[27] Federal Ministry for Economic Affairs and Energy, Plattform Industrie 4.0 - \nRAMI4.0 – a reference framework for digitalisation, Plattf. Ind. 4 (2019), 0. \n[28] M. Yli-Ojanper aa, S. Sierla, N. Papakonstantinou, V. Vyatkin, Adapting an agile \nmanufacturing concept to the reference architecture model industry 4.0: a survey \nand case study, Journal of Industrial Information Integration 15 (2019) 147–160. \n[29] J.H. Kim, A review of cyber-physical system research relevant to the emerging IT \ntrends: industry 4.0, IoT, big data, and cloud computing, Journal of Industrial \nIntegration and Management 2 (3) (2017) . \n[30] H. Gill, R. Baheti, Cyber-physical systems, in: T. Samad, A.M. Annaswamy (Eds.), \nThe Impact of Control Technology, 2011 . \n[31] H. Gill, R. Baheti, Cyber-physical Systems: from Theory to Practice, 2011 . \n[32] L. Monostori, Cyber-physical systems in manufacturing, CIRP Ann 65 (2) (2016) \n621–641. \n[33] R. Rajkumar, I. Lee, L. Sha, J. Stankovic, Cyber-physical systems: the next \ncomputing revolution, Des. Autom. Conf. (2010) 731–736. \n[34] A. Napoleone, M. Macchi, A. 
Pozzetti, A review on the characteristics of cyber- \nphysical systems for the future smart factories, J. Manuf. Syst. 54 (December) \n(2019) . \n[35] S. Thiede, M. Juraschek, C. Herrmann, Implementing cyber-physical production \nsystems in learning factories, Procedia CIRP 54 (2016) 7–12. \n[36] C. Zhan, Y. Chen, A review of research relevant to the emerging industry trends: \nindustry 4.0, IoT, blockchain, and business analytics, Journal of Industrial \nIntegration and Management 5 (1) (2020) 165–180. \n[37] H. Chen, Theoretical foundations for cyber-physical systems: a literature review, \nJournal of Industrial Integration and Management 2 (3) (2017) . \n[38] Y. Lu, Cyber physical system (CPS)-based industry 4.0: a survey. Journal of \nIndustrial Integration and Management, Journal of Industrial Integration and \nManagement 2 (3) (2017) . \n[39] G.K. Rand, N. Singh, D. Rajamani, Cellular manufacturing systems design, planning \nand control, J. Oper. Res. Soc. (1997) . \n[40] T. Pultarova, “Satellite Manufacturing in a State of Transition, ” [Online]. \nAvailable: http://interactive.satellitetoday.com/via/march-2019/satellite-manu \nfacturing-in-a-state-of-transition/_fragment.html . [Accessed October 2020]. \n[41] P.M. Laurent Jaffarta, Constellations: The satellite serial production challenge, in: \n71st International Astronautical Congress (IAC) – the CyberSpace Edition, October \n2020, pp. 12–14. \n[42] e. directory, “WorldView legion constellation, ” European Space Agency. [Online]. \n[Accessed February 2021]. \n[43] C. Hofacker, How to Make a Megaconstellation, March 2020 [Online]. Available: \nhttps://aerospaceamerica.aiaa.org . \n[44] T. Gornet, T. Wohlers, History of Additive Manufacturing, ” Wohlers, 2014 . \n[45] A. Javaid, M. Haleem, Additive manufacturing applications in industry 4.0: a \nreview, Journal of Industrial Integration and Management 4 (4) (2019) . \n[46] K. Schwab, The Fourth Industrial Revolution, Portfolio Penguin, 2017 . \n[47] A. 
B˘ecue, CyberFactory#1 – securing the Industry 4.0 with cyber-ranges and digital \ntwins, in: IEEE, 2018 . \n[48] HMI/SCADA software in the age of Industrial IoT and evolving human machine \ninterfaces, ” I-Scoop, [Online]. Available: https://www.i-scoop.eu/industry-4-0/h \nmi-scada-software/ . [Accessed February 2021]. \n[49] Y. Lu, Current Standards Landscape for Smart Manufacturing Systems, ” National \nInstitute of Standards and Technology - US Department of Commerce, February \n2016 . \n[50] H. Wang, Enterprise system and its application in aerospace industry, Journal of \nIndustrial Integration and Management 2 (2) (2017) . \n[51] I.C. Reinhardt, Current perspectives on the development of industry 4.0 in the \npharmaceutical sector, Journal of Industrial Information Integration 18 (3) (2020) . \n[52] H. Wu, S. Li, L.D. Xu, Internet of things in industries: a survey, IEEE Trans. Ind. Inf. \n10 (4) (2014) 2233 –2243 . \n[53] A. B˘ecue, A new concept of digital twin supporting optimization and resilience of \nfactories of the future, Appl. Sci. 10 (2020) 4482 . \n[54] T. Fei, Z. Meng, Digital twin shop-floor: a new shop-floor paradigm towards smart \nmanufacturing, IEEE Access 5 (2017) . \n[55] H. Gill, R. Baheti, Cyber-physical systems. The impact of control technology, IEEE \nControl Systems Society 1 (2011) . \n[56] E.A. Lee, Cyber physical systems: design challenges, in: 11th IEEE. International \nSymposium on Object and Component-Oriented Real-Time Distributed Computing, \nISORC)., 2008, pp. 363–369. [57] M. Abdirad, A two-stage metaheuristic algorithm for the dynamic vehicle routing \nproblem in industry 4.0 approach, J. Manag. Anal. 1 (15) (2020) . \n[58] J. Lee, B. Bagheri, H.A. Kao, A Cyber-Physical Systems architecture for Industry \n4.0-based manufacturing systems, Manufacturing Letters 3 (2015) 18–23. \n[59] G. Aceto, V. Persico, A. 
Pescap ˘e, Industry 4.0 and health: internet of things, big \ndata, and cloud computing for healthcare 4.0, Journal of Industrial Information \nIntegration 18 (2020) . \n[60] X. You, Towards 6G Wireless Communication Networks: Vision, Enabling \nTechnologies, and New Paradigm Shifts, vol. 64, Science China - Information \nSciences, 2021 . \n[61] Y. Lu, Security in 6G: the prospects and the relevant technologies, Journal of \nIndustrial Integration and Management 5 (3) (2020) 271–289. \n[62] A. Cusano, P. Salvarezza, G. Breglio, A. Cutolo, A. Calabr ˇo, M. Giordano, S. De \nNicola, An integrated fiber optic sensing system for in situ characterization of the \ncuring, Proc. SPIE 4328 (2001) 275–284. \n[63] K.H. Wood, T.L. Brown, M.C. Wu, C.B. Gause, Fiber Optic Sensors for Cure-Health, \n” Proceeding 3rd Intern. Workshop on Structural Health, 2001, pp. 1149 –1157 . \n[64] K. Saddik, M. Alam, A. El, C2ps: a digital twin architecture reference model for the \ncloud-based cyber-physical systems, IEEE Access 5 (2017) 2050 –2062 . \n[65] R.D. Finlayson, M. Friesel, M. Carlos, P. Cole, Health monitoring of aerospace \nstructures with acoustic emission and acousto-ultrasonics, in: 15th World \nConference on Non-destructive Testing, 2000 . \n[66] V. Giurgiutiu, A. Zagrai, J.J. Bao, Piezoelectric wafer embedded active sensors for \naging aircraft structural health monitoring, Int. J. Struct. Health Monitor. \nNovember (2001) . \n[67] D. Devillers, F. Taillade, D. Osmont, D. Balageas, D. Royer, Interaction of Lamb \nwaves with defects in composite sandwich structures, in: European COST F3 \nConference on System, 2000 . \n[68] J.S. Kim, K.J. Vinoy, V.K. Varadan, Wireless health monitoring of cracks in \nstructures with MEMS-IDT sensors, Proc. SPIE 4700 (2002) 342–353. \n[69] S.J. Burgett, M. Kranz, MEMS sensor systems developments at AMCOM for \nenvironmental conditions monitoring, in: Proc. 3 Rd Intern. Workshop on \nStructural Health Monitoring, 2001, pp. 1134 –1141 . 
\n[70] D. Chung, Structural health monitoring by electrical resistance measurement, \nJournal of smart materials and structures 10 (2001) 624–636. \n[71] L. Zhang, Novel self-healing CFRP composites with high glass transition \ntemperatures, Compos. Sci. Technol. 168 (2018) 96–103. \n[72] W.H. Prosser, T.L. Brown, S.E. Woodard, G.A. Fleming, E.G. Cooper, Sensor \ntechnology for integrated vehicle health management of aerospace vehicles, in: AIP \nConference Proceedings, vol. 657, 2003, p. 1582 . \n[73] P. Gaudenzi, M. Bernabei, E. Dati, G. De Angelis, M. Marrone, L. Lampani, On the \nevaluation of impact damage on composite materials by comparing different NDI \ntechniques, Compos. Struct. 118 (2014) 257–266. \n[74] X. Maldague, Theory and Practice of Infrared Thermography for Non Destructive \nTesting, John Wiley & Sons, Canada, 2001 . \n[75] S. Deane, Application of NDT thermographic imaging of aerospace structures, \nInfrared Phys. Technol. 97 (2019) 456–466. \n[76] D.I. Gillespie, Defect detection in aerospace sandwich composite panels using \nconductive thermography and contact sensors, Sensors 20 (2020) . \n[77] R.D. Finlayson, M. Friesel, M. Carlos, P. Cole, Health monitoring of aerospace \nstructures with acoustic emissions and acousto-ultrasonics, in: 15th World \nConference on Non-destructive Testing, October 2020 . \n[78] P. Daryabor, M.S. Safizadeh, Image fusion of ultrasonic and thermographic \ninspection of carbon/epoxy patches bonded to an aluminum plate, NDT E Int. 90 \n(2017) 1–10. \n[79] P. Gaudenzi, Smart Structures: Physical Behaviour, Mathematical Modelling and \nApplications, John Wiley Sons, 2009 . \n[81] [Online]. Available:, Elastic, November 2021. Accessed November 2021, http \ns://www.elastic.co/ . \n[82] Grafana [Online]. Available: Accessed November 2021, https://grafana.com/ , \nNovember 2021. \n[83] Ignite [Online]. Available: Accessed November 2021, https://ignite.apache.org/ , \nNovember 2021. \n[84] Kafka [Online]. 
Available: Accessed November 2021, https://kafka.apache.org/ , \nNovember 2021. \n[85] Kibana [Online]. Available: Accessed November 2021, https://www.elastic. \nco/kibana/ , November 2021. \n[86] Pytorch [Online]. Available: Accessed November 2021, https://pytorch.org/ , \nNovember 2021. \n[88] [Online]. Available:, Scikit Learn, November 2021. Accessed November 2021, \nhttps://scikit-learn.org/ . \n[89] Tensorflow [Online]. Available: Accessed November 2021, https://www.tensor \nflow.org/ , November 2021. \n[90] Redis [Online]. Available: Accessed November 2021, https://redis.io , November \n2021. \n[91] M. Li, “Spatial-Temporal Finite Element Analytics for CPS-Enabled Smart Factory: \nApplication in Hybrid Flow Shop, ” Procedia Manufacturing, 2020, pp. 1229 –1236 . \n[92] “Flink Flink [Online]. Available: Accessed November 2021, https://flink.apache. \norg/, November 2021. M. Eugeni et al. ",
"metadata": {
"filename": "An industry 4.0 approach to large scale production of satellite 2022.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\An industry 4.0 approach to large scale production of satellite 2022.pdf",
- "file_size": 8180979,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:35.186178",
- "content_length": 85542
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\An industry 4.0 approach to large scale production of satellite 2022.pdf",
+ "size": 8180979,
+ "source": "docs_to_import"
+ },
+ "id": "93a802ed-36af-48c8-ac94-4bac559d4f39"
},
- "b9f85718-1117-4a9a-ad4a-1eade3ffcda1": {
- "id": "b9f85718-1117-4a9a-ad4a-1eade3ffcda1",
- "content": "[Página 1]\nAssessing business value of Big Data Analytics in European firms☆\nNadine Côrte-Real ⁎, Tiago Oliveira, Pedro Ruivo\nNOVA IMS, Universidade Nova de Lisboa, 1070-312, Lisboa, Portugal\nabstract article info\nAvailable online 9 August 2016 In the strategic management field, dynamic capabilities (DC) such as organizational agility are considered to be\nparamount in the search for competitive advantage. Recent research claims that IT business value research\nneeds a more dynamic perspective. In particular, the Big Data Analytics (BDA) value chain remains unexplored.\nTo assess BDA value, a conceptual model is proposed based on a knowledge-based view and DC theories. Toempirically test this model, the study addresses a survey to a wide range of 500 European firms and their IT\nand business executives. Results show that BDA can provide business value to several stages of the value chain.\nBDA can create organizational agility through knowledge management and its impact on process and\ncompetitive advantage. Also, this paper demonstrates that agility can partially mediate the effect betweenknowledge assets and performance (process level and competitive advantage). The model explains 77.8% of\nthe variation in competitive advantage. The current paper also presents theoretical and practical implications\nof this study, and the study's limitations.\n© 2016 Elsevier Inc. All rights reserved.Keywords:\nBig Data Analytics (BDA)\nIT business value\nKnowledge Based View (KBV)Dynamic capabilities (DC)Organizational agilityCompetitive advantage\n1. Introduction\nIn the era of Big Data, firms in every sector are required to deal with a\nhuge amount of data. Data in vast amounts can offer invaluable insights\nand competitive advantage if the right technological and organizational\nresources support them ( Morabito, 2015 ). 
Recently, several academics\nand practitioners have stressed the need to understand how, why, and\nwhen Big Data Analytics (BDA) applications can be a valuable resource\nfor companies to gain competitive advantage ( Abbasi, Sarker, &\nChiang, 2016; Agarwal & Dhar, 2014; Corte Real, Oliveira, & Ruivo,\n2014; LaValle et al., 2011 ). Although BDA technologies have been\nrecognized as the “next big thing for innovation ”(i.e., a potential source\nof business value and competitive advantage), the BDA value chain\nremains relatively unexplored and needs further investigation. No\nempirical research exists assessing how BDA can bring business value\n(Abbasi et al., 2016 ), establishing a linkage between knowledge assets,\norganizational agility, and performance (process-level and competitive\nadvantage) ( Corte Real et al., 2014 ). Firms that inject BDA in their\nbusiness operations can surpass their peers by 5% in productivity and\n6% in pro fitability ( Barton, 2012 ). For that reason, European firms are\ninvesting heavily in BDA technologies ( SAS, 2013; Sharma, Mithas, &\nKankanhalli, 2014 ). Nevertheless, this investment can only be valuableif organizations use the appropriate technology and organizational\nresources to achieve competitive advantage ( Manyika et al., 2011a ).\nIn response to the scarcity of research on this subject, this study\nexamines the impact of BDA on the business value chain in a\nEuropean context by empirically testing a new theoretical frame-\nwork that merges two strategic management theories (Knowledge\nB a s e dV i e w( K B V )a n dd y n a m ic capabilities (DC)) at firm-level. Not\nonly does this paper extend BDA research by transposing, merging,\nand examining hypotheses in IT innovations and management fields,\nbut also contributes to DC research by empirically assessing the ante-\ncedents and impacts of a speci fic dynamic capability (organizational\nagility), when using BDA technologies. 
This is the first paper that\nstudies the entire BDA value chain at firm-level, linking concepts of\nknowledge management, agility, and performance (process-level\nand competitive advantage). To clarify the role of agility on perfor-\nmance, this papers tests if agility is a mediator of knowledge assets\non performance (process-level performance and competitive\nadvantage). The study explores the following three research ques-\ntions (RQs):\nRQ1 –What are the BDA enablers for the creation of organizational\nagility?RQ2 –What are the impacts of this dynamic capability created by\nBDA on sustainable competitive advantage?\nRQ3 –Is agility a mediator of knowledge assets on performance\n(process-level performance and competitive advantage)?Journal of Business Research 70 (2017) 379 –390\n☆The author is grateful for the comments by anonymous reviewers, on earlier drafts of\nthis article.\n⁎Corresponding author.\nE-mail address: nreal@novaims.unl.pt (N. Côrte-Real).\nhttp://dx.doi.org/10.1016/j.jbusres.2016.08.011\n0148-2963/© 2016 Elsevier Inc. All rights reserved.\nContents lists available at ScienceDirect\nJournal of Business Research\n\n[Página 2]\nThis study offers guidance for executives and managers to assess the\nconditions under which BDA can add business value to organizations.\nManagers and IT executives can bene fit from an evaluation instrument\nto assess the impact of BDA. Also, this paper provides valuable support\nto justify BDA investments and initiatives. Firms that have not yet\ndecided to adopt these technologies can obtain a view of potential\ngains from adopting and effectively using BDA. 
This research demon-strates how best to leverage the knowledge embedded in BDA systems,\nacquiring organizational agility capabilities that lead toward competi-\ntive advantage.\nThe remainder of this paper has the following structure: Section 2\nprovides an introduction to the BDA concept and a theoretical\nbackground to assess BDA initiatives; Section 3 presents the conceptual\nmodel and the hypotheses; Section 4 outlines the methodology; and\nSection 5 shows the empirical results. Finally, the paper presents a\ndiscussion and the conclusions from the findings.\n2. Background2.1. Big Data Analytics\nChen, Chiang ( Chen, Chiang, & Storey, 2012 ) coined the term Big\nData Analytics (BDA) as a related field of business intelligence &\nanalytics (BI&A), referring to the BI&A technologies that mostly concern\ndata mining and statistical analysis. Authors de fine BDA as “an e w\ngeneration of technologies and architectures, designed to economically\nextract value from very large volumes of a wide variety of data, by enabling\nhigh velocity capture, discovery and/or analysis. ”(IDC, 2011 ). BDA tech-\nnologies allow firms to improve existing applications by offering\nbusiness-centric practices and methodologies that provide a competi-\ntive advantage ( Chen et al., 2012; Davenport, 2006 ). The latest literature\nindicates that there is much room for further BDA research ( Abbasi\net al., 2016; Agarwal & Dhar, 2014; Erevelles, Fukawa, & Swayne,\n2016 ). There are already academic studies that re flect the adoption\nand use of BDA (e.g., ( Malladi, 2013; Xu, Frankwick, & Ramirez, 2016;\nKwon, Lee, & Shin, 2014 )). Regarding value, most BDA academic studies\nfocus on analyzing business value from a data or system perspective\n(e.g., ( LaValle et al., 2011; Kwon et al., 2014 )). From the strategic\nmanagement perspective only one conceptual paper explores how\nBDA affects several marketing activities ( Erevelles et al., 2016 ). 
The\nremaining literature addresses industry primarily ( LaValle et al., 2011;\nRussom, 2011 ). As firms do not know how to capture business value\n(Barton, 2012; LaValle et al., 2011 ), some scholars ( Corte Real et al.,\n2014; Malladi, 2013 ) argue that BDA value research is scarce and\nneeds to extend beyond post-adoption stages toward competitiveness\n(Erevelles et al., 2016; Xu et al., 2016 ). Although numerous approaches\nassess IT Value at the process and firm levels (see Schryen ( Schryen,\n2013 ) for a review), this study extends IT business value research\nfrom the strategic management perspective, by empirically assessing\nthe BDA business value chain in European firms.\n2.2. Theoretical foundation\nMany studies in recent decades investigate IT business value and\ncompetitive advantage using the resource-based view (RBV) ( Barua,\nKriebel, & Mukhopadhyay, 1995; Bharadwaj, 2000; Mata, Fuerst, &\nBarney, 1995; Melville, Kraemer, & Gurbaxani, 2004; Ruivo, Oliveira, &\nNeto, 2015; Soh & Markus, 1995; Zhu & Kraemer, 2005 ). The limitations\nof RBV encourage the use of other theories such as DC and KBV ( Arend &\nBromiley, 2009; Wang & Ahmed, 2007 ). As DC theory constitutes the\nsecond foundation that supports knowledge-based thinking ( Pettigrew,\nThomas, & Whittington, 2001 ), this study combines these theories. KBV\nexplores a firm's potential to acquire competitiveness in a dynamic\nmarket context, but only DC theory can solve the problem of sustaining\ncompetitive advantage in turbulent environments ( Grant, 1996;\nVolberda, 1996 ).2.2.1. Knowledge Based View theory\nKBV states that a firm's knowledge resources are unique and\ninimitable and that the firm's primary function is to leverage them\ninto productive outcomes ( Grant, 1996; Nonaka, 1995 ). The possession\nof knowledge resources gives the firm basic foundations to renew or re-\nconfigure its resource base and to build dynamic capabilities ( Wu,\n2006 ), such as organizational agility. 
Companies that have high levels\nof staff knowledge and involvement can more skillfully identify the\nneed to make changes to existing resources and decide about the ac-\ntions necessary to implement these changes ( Nieves & Haller, 2014 ).\nKBV theory can help to conceptualize the performance effects of IT in-\nvestments ( Pavlou et al., 2005 ). Management studies use this theory\n(e.g., ( Nieves & Haller, 2014 )), as do studies in IT fields (e.g., ( Sher &\nLee, 2004 )) to understand the role of knowledge management in the\ncreation of DC. In BDA technologies, Xu, Frankwick ( Xu et al., 2016 )\nseek to understand the relationships among traditional marketing\nanalytics, BDA, and new product success. The current paper is the first\nthat empirically tests KBV to understand the role of BDA in the creation\nof agility.\n2.2.2. Dynamic capability theory\nIn the past decade the DC perspective arose as one of the most\neffective theoretical lenses for the strategic management field\n(Schilke, 2014 ), attracting the interest of scholars not only in business,\nbut also in the IT management field ( Helfat et al., 2009; Protogerou,\nCaloghirou, & Lioukas, 2012 ). Rooted in RBV and KBV, DC argues that\nthe dynamic capabilities enable firms to modify their resource to\nadapt rapidly to changing conditions, helping them to sustain their\ncompetitive advantage over time ( Helfat & Peteraf, 2009; Teece,\nPisano, & Shuen, 1997 ). Although the literature has a broad range of\ndefinitions for DC, one of the seminal papers de fines DC as “the ability\nto integrate, build, and recon figure internal and external competencies to\naddress rapidly-changing environments ”(Teece et al., 1997 ). 
DC\ndisaggregates into “the capacity (1) to sense and shape opportunities\nand threats, (2) to seize opportunities, and (3) to maintain competitive-ness through enhancing, combining, protecting, and, when necessary,\nrecon figuring the business enterprise's intangible and tangible assets ”.\nSome authors argue that agility is an organizational dynamic\ncapability ( Blome, Schoenherr, & Rexhausen, 2013; Sambamurthy\net al., 2007; Zhou & Wu, 2010 ). Teece ( Teece, 2007 )d efines agility as a\nhigher-order dynamic capability that emerges over time, generally\ndefining agility as a capability with which firms can identify and re-\nspond to environmental threats and opportunities and quickly adjust\ntheir behaviors ( Goldman, Nagel, & Preiss, 1995; Sambamurthy,\nBharadwaj, & Grover, 2003 ). This concept also relates to the operational\nflexibility of organizational processes and IT systems to support\nstructured or unstructured changes ( Chen et al., 2014 ). Achieving agility\ndemands processing a large and varied amount of information\n(Goldman et al., 1995 ). This process is possible with BDA applications.\nHowever, like IT applications ( Sambamurthy et al., 2003; Weill,\nSubramani, & Broadbent, 2002 ), BDA tools cannot automatically\nimprove agility. In fact, under certain conditions BDA tools can impede\nagility ( Chen et al., 2014 ). For this reason, the need exists to understand\nhow BDA applications can create agility.\nSeveral recent studies in the business management field apply DC\ntheory to measure the in fluence of DC in the creation of competitive ad-\nvantages (e.g., Schilke, 2014; Zott, 2003; Drnevich & Kriauciunas, 2011 ).\nIn the IT management field, few empirical studies use this theory.\nAnalyzing the IT in fluence on DC generically, ( Chen et al., 2014; Sher\n& Lee, 2004 ), researchers conclude that IT is an enabler of DC in\norganizations. 
Regarding agility, several studies assess the impact of IT\non organizational agility (e.g., Sambamurthy et al., 2007; Chen et al.,\n2014; Cai et al., 2013; Tallon & Pinsonneault, 2011; Liu et al., 2013; Lu\n& Ramamurthy, 2011 ). These studies demonstrate a positive relation-\nship between IT and agility. Chen ( Chen et al., 2014 ) recently concludes\nthat the IT business value essentially depends on how agile a firm is380 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 3]\nwith regard to managing business processes. Although the literature\naddresses the impact of IT on the creation of organizational agility, no\nstudy links BDA with this speci fic DC. Apart from some qualitative stud-\nies in the area of business analytics (BA) ( Shanks & Bekmamedova,\n2013; Shanks & Sharma, 2011 ), only conceptual papers use DC theory\nto study BDA value ( Corte Real et al., 2014; Erevelles et al., 2016 ).\nFirms that do not develop the resources and capabilities to use BDA\napplications will struggle to develop a sustainable competitive advan-\ntage ( Erevelles et al., 2016 ). Given that agility is vital for companies´\nsurvival, and that BDA can support organizational business processes,\nthis study fills this academic gap and links the two concepts empirically.\n3. Conceptual model\nWith recourse to the two strategic management theories (KBV and\nDC) discussed above, this section explains the conceptual model and\nthe speci fic hypotheses ( Fig. 1 ).\nRooted in an earlier conceptual model ( Corte Real et al., 2014 ), this\nresearch model empirically tests 12 propositions. The study assesses\nthe entire value chain starting with how BDA can leverage different\nforms of knowledge to create organizational agility ( H1,H2,H3). BDA\ntechnologies can provide organizational agility to the firm by using\neffective knowledge management. 
Firms owning this type of dynamic\ncapability can achieve competitive advantage directly (H4a) or indirectly through business processes (H4b). Results obtained by using business\nprocesses will impact the overall organization (H5). Agility can also\nmediate the relationship between knowledge assets and performance\n(H6a,b,c-H7a,b,c). BDA uses some controls such as country, industry,\ntechnological turbulence, and time.\n3.1. Hypotheses\n3.1.1. Knowledge assets\nOrganizational knowledge such as operational routines, skills, and\nknow-how constitutes a key source of competitiveness (Grant, 1996).\nKnowledge management plays a critical role in proficiently managing\ndata and delivering it to the end users to support business processes\n(Rajpathak & Narsingpurkar, 2013). Knowledge management represents a dimension supported by KBV (Ruggles, 1998) and enables\ndynamic capabilities by offering specific functional competences that\ncan improve business performance (Teece et al., 1997). A natural relationship exists between KM and BDA. Both deal with intangible\nassets such as data, knowledge, and intelligence (Erickson & Rothberg,\n2015). BDA is a source of knowledge management, allowing firms to\nadd value primarily at the beginning of the information value chain\nand helping knowledge to flow to achieve business excellence (Chau\n& Xu, 2012; Popovič et al., 2012).\nBig data is a potential knowledge asset, contingent upon the proper\nuse of that knowledge (Erickson & Rothberg, 2015). BDA represents\ntechnology drivers of a strategic knowledge asset (big data). BDA\napplications have the potential to add value by providing more\ntransparent and accurate results to support decision-making in several\nbusiness areas (Manyika et al., 2011a).\nBDA strategy requires the capacity to sense, acquire, process, store,\nand analyze the data and convert that data into knowledge (Rajpathak\n& Narsingpurkar, 2013). 
Several empirical studies state that the knowl-\nedge processes are antecedent dimensions of successful DC, by allowing\nfirms to continually renew their knowledge base and deliver business\nperformance ( Ambrosini & Bowman, 2009; Sher & Lee, 2004; Zheng,\nZhang, & Du, 2011 ). As DC are information-intensive ( Pavlou & El\nSawy, 2011 ), BDA may help in the creation of DC and organizational\nagility speci fically. Using BDA technologies helps to store and share\nknowledge, thereby allowing for an improvement of organizational\nknowledge by promoting ef ficiency within an organization, particularly\nby data integration and the use of analytical tools ( Russom, 2011 ). Some\nauthors argue that firms must combine endogenous and exogenous\nknowledge to achieve DC ( Sher & Lee, 2004 ). Zhao ( Cai et al., 2013 )\nargues that IT capability and KM capability are important in fostering\norganizational agility. Agility is promoted through knowledge manage-\nment by improving innovative responses, and can improve through the\nuse of IT and automated business processes ( Cai et al., 2013 ). In the\nsame way, organizations should be able to use BDA technologies to\nconvert knowledge into new routines and enhance organizational\nagility. Based on these findings, the hypotheses are:\nH1. BDA technologies allow an effective endogenous knowledge\nmanagement that positively in fluences dynamic capabilities such as\norganizational agility.\nH2. BDA technologies allow an effective exogenous knowledge\nmanagement that positively in fluences dynamic capabilities such as or-\nganizational agility.\nFig. 1. Proposed conceptual model.381 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 4]\nKnowledge sharing with key channel partners refers to the extent to\nwhich a firm shares insights and know-how about its business context\nwith its partners ( Saraf, Langdon, & Gosain, 2007 ). 
Channel partners\nare considered to be tactically and strategically important for\ncompanies. They can help to collect crucial market-related information\nwith which to fine tune the strategy to meet customer needs, resulting\nin long-term financial performance ( Lorenzoni & Lipparini, 1999 ).\nLiterature points out that the collaborative knowledge sharing capacity\nprovides an opportunity to increase value (e.g.,( Saraf et al., 2007 )) and\nenable DC (e.g., ( Della Corte & Del Gaudio, 2012 )). Considering that\nDC theory encompasses several levels of analysis, it is important to\nconsider the relational view, including the ability to collaborate with\nchannel partners ( Teece, 2007 ). Literature shows that agility needs the\nsupport of effective knowledge sharing ( Liu, Song, & Cai, 2014 ). Some\nstudies link the knowledge sharing capability through IT with agility\n(e.g., ( Cai et al., 2013; Liu et al., 2014 )). Such interactions can also\nbenefit from the use of BDA technologies, consequently enhancing\norganizational agility by in fluencing the capabilities to sense opportuni-\nties and threats, shape them, and seize them ( Della Corte & Del Gaudio,\n2012 ). Therefore, another hypothesis is:\nH3. BDA technologies allow an effective knowledge sharing with\npartners that positively in fluences organizational dynamic capabilities\nsuch as organizational agility.\n3.1.2. Organizational agility\nDC can play a key role in determining a firm's competitive advantage\n(Teece et al., 1997; Zott, 2003 ). Agility is the “capacity of an organization\nto efficiently and effectively redeploy/redirect its resources to value cre-\nating and value protecting (and capturing) higher-yield activities as in-\nternal and external circumstances warrant ”(Teece, Peteraf, & Leih,\n2016 ). 
In the management field several researchers recognize that DC\ndoes not lead directly to sustainable competitiveness, and that this\nvalue derives from improved business processes (e.g., ( Schilke, 2014;\nDrnevich & Kriauciunas, 2011 )). Some authors conclude that agility\ncan in fluence organizational performance ( Cai et al., 2013; Liu et al.,\n2013; Tallon & Pinsonneault, 2011 ). Hence, additional hypotheses are:\nH4a. Organizational agility is a dynamic capability leveraged by BDA\nthat positively affects the creation of competitive advantages.\nH4b. Organizational agility is a dynamic capability leveraged by BDA\nthat positively in fluences the process-level performance.\nBy engaging the business activities (e.g., sense customer needs, mar-\nket research, R&D) companies can increase the possibility of achieving\nprocess innovation success ( Zollo & Winter, 2002 ). In the IT field some\nauthors focus on the importance of assessing how business processes\ncan bring value to firms (e.g., ( Chen et al., 2014; Tallon, 2007 )). Recent\nconceptual considerations are that BDA is a source of DC (organizational\nagility, speci fically) and that BDA are a way to provide business value to\nfirms ( Erevelles et al., 2016 ). Therefore, the hypothesis is:\nH5. Process-level performance has a positive effect on competitive\nadvantage.\n3.1.3. The mediating role of agility on the relationship between knowledge\nassets and performance\nEarlier IT literature considers that dynamic capabilities can establish\na link between knowledge assets and firm performance ( Sher & Lee,\n2004; Wang, Klein, & Jiang, 2007 ). 
In the management field some\nauthors examine agility as a mediator between the management of\nknowledge assets and performance ( Chung, 2010; Liu et al., 2014 ).\nAlso, the proposed model suggests a potential mediating role of agility\nin the relationship between knowledge assets and two types ofperformance (process-level performance and competitive advantage).\nThus, additional hypotheses are:\nH6a. Agility positively mediates the relationship between endogenous\nknowledge management and competitive advantage.\nH6b. Agility positively mediates the relationship between exogenous\nknowledge management and competitive advantage.\nH6c. Agility positively mediates the relationship between knowledge\nsharing with partners and competitive advantage.\nH7a. Agility positively mediates the relationship between endogenous\nknowledge management and process-level performance.\nH7b. Agility positively mediates the relationship between exogenous\nknowledge management and process-level performance.\nH7c. Agility positively mediates the relationship between knowledge\nsharing with partners and process-level performance.\n3.1.4. Competitive advantage\nCompetitive advantage exists when a firm reveals having greater\nsuccess compared with its current or potential competitors ( Peteraf &\nBarney, 2003 ). To be consistent with this conceptualization, superior\nfirm performance relative to that of competitors constitutes an empiri-\ncal and common indicator of competitive advantage. ( Barnett, Greve, &\nPark, 1994; Schilke, 2014 ). Based on Schilke's construct ( Schilke, 2014 ),\ncompetitive advantage was operationalized as re flective-re flective type\n(Ringle, Sarstedt, & Straub, 2012 ), with the first-order dimensions of:\n(1) strategic performance (qualitative dimension) and (2) financial per-\nformance (quantitative dimension), both in comparison to competition.\n3.1.5. 
Controls\nAs literature widely supports, this study uses the industry and the\ncountry in which a firm competes as predictors of competitiveness\n(Schilke, 2014 ). BDA may be particularly useful to firms operating in\nturbulent technological environments ( Wade & Hulland, 2004 ), and\nconsequently, following the approach of Menguc and Auh ( Menguc &\nAuh, 2006 ) and Drnevich and Kriauciunas ( Drnevich & Kriauciunas,\n2011 ), the study includes turbulent technological environment as a con-\ntrol. A turbulent technological environment makes current technology\nobsolete and requires the development of new advances ( Menguc &\nAuh, 2006 ). Finally, we use the variable “time since adoption of BDA ”\nto control for the knowledge and experience that organizations gain\nby using BDA over time ( Elbashir et al., 2013 ). These controls explain\nall dependent variables (agility, process-level performance, and\ncompetitive advantage).\n4. Research design\n4.1. Measurement\nTo test the model ( Fig. 1 ) and the related hypotheses, the study per-\nforms a multi-country survey of European organizations from several\nindustries. Following the recommendations of Moore and Benbasat\n(Moore & Benbasat, 1991 ), the study uses a survey instrument drawing\nupon a comprehensive literature review. Regarding content validity,\nfive established academic IS researchers and two language experts\nreview each item on the questionnaire, assessing its content, scope,\nand purpose ( Brislin, 1970 ). To test the dif ficulty of the questions, to-\ngether with the reliability and validity of the scales, a pilot study uses\na sample of 30 executives from firms not part of the main survey.\nRemoval of some items reduces ambiguity and simpli fies interpretation.\nThe survey instrument and measurement items are in Appendix A.382 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 5]\n4.2. Data\nThe survey was conducted in 2015 using an online survey tool. 
To\nguarantee the quality of the data, the respondent pro file uses the\nfollowing three criteria: deep knowledge of the organization strategy,\nmore than five years of experience in BI&A/BDA initiatives, and holding\nan IT/business executive or management position in the company. Themailing database comes from Dun & Bradstreet, one of the world's lead-\ningfirms for commercial information and business insight. The initial\nsample of 500 firm executives from European firms receives an email\nto participate in the survey.\nNinety-two valid responses were received in the first month. To\nincrease the response rate a follow-up email was sent. During the\nfollowing months 83 additional valid responses were received from\nlate responders, totaling 175 usable responses (overall response rate\nof 35%). As seen in Table 1 , the sample comprises different industries\nof which almost half are financial firms (40.5%). Regarding firm size,\nthe sample is equally distributed between mid-size and large compa-\nnies. Business (41.4%) and IT executives (58.6%) are well represented.\nNon-response bias was assessed using the sample distributions of the\nearly and late respondent groups compared with the Kolmogorov-\nSmirnov test ( Ryans, 1974 )( s e e Table 2 ). The early respondents were\nidenti fied by selecting the respondents in the first month. The test\nshows that the two groups do not differ statistically (5% signi ficance\nlevel, pN0.05), demonstrating the absence of non-response bias\n(Ryans, 1974 ). Due to the fact that the study collects data simultaneous-\nly from a single source, for the sake of validity, common method bias\nneeds to be assessed. The study uses Harman's post hoc single-factor\nanalysis for this purpose. A factorial analysis of all indicators was con-\nducted and the first extracted factors explain 36.9% of variance. This\nmeans that common method bias is unlikely to be an issue in the data\nPodsakoff et al., 2003 .\n5. 
Results\nTo estimate the conceptual model, the study uses the partial least\nsquares (PLS) method (Hair, Ringle, & Sarstedt, 2011). PLS fulfills the research purpose by examining the validity of the constructs, without\nrequiring normal distributions for the variables. PLS requires a sample\nsize of ten times the largest number of structural paths\ndirected at a particular construct (Gefen & Straub, 2005). In the\nconceptual model the largest number of structural paths directed to a\nparticular construct is three, which means that the minimum sample\nsize should be 30. The sample is larger (n = 175), meaning that it is adequate for PLS. Before testing the structural model, the study analyzes\nthe measurement model in order to assess reliability and validity.\n5.1. Measurement model\nThe study examines indicator reliability, construct reliability, convergent validity, and discriminant validity in order to assess the measurement model. Tables 3 and 4 show the results of the measurement\nmodel. Regarding indicator reliability, only loadings above 0.7 were\nconsidered. Hence, four items (ENKM5, DC1, PLP3-4) were eliminated.\nAs Table 3 reveals, the instrument presents good indicator reliability,\nas the loadings are above 0.70. The composite reliability coefficient assesses the construct reliability because construct reliability takes into\nconsideration indicators having different loadings (Hair et al., 2011;\nHenseler, Ringle, & Sinkovics, 2009). Table 4 shows that all constructs\nhave composite reliability above 0.7, which suggests that the constructs\nare reliable. To test convergent validity, the study uses average variance\nextracted (AVE). The AVE should be higher than 0.5 (i.e., the latent variable explains more than half of the variance of its indicators (Henseler\net al., 2009; Fornell & Larcker, 1981)). Table 4 shows that all constructs\nmeet this criterion. 
Regarding discriminant validity, the study uses two\nmeasures: the Fornell-Larcker criterion and cross-loadings. First, ac-\ncording to Fornell and Larcker ( Fornell & Larcker, 1981 ), the square\nroot of AVE should be greater than the correlations with other latent\nvariables. Table 4 shows that the square roots of AVEs (in bold) are\nhigher than the correlation between constructs. All the constructs\nshow evidence of acceptable discrimination. Second, the loading of\neach indicator should be greater than all cross-loadings ( Chin, 1998a )\n(see Table 3 ). Overall, the model has good indicator reliability, construct\nreliability, convergent validity, and discriminant validity. As these\ncriteria are met, the constructs can test the structural model.\n5.2. Structured model\nTo evaluate the structured model, we followed Hair's five-step\napproach ( Hair et al., 2013 ): (1) collinearity assessment, (2) structural\nmodel path coef ficients, (3) coef ficient of determination (R2value),\n(4) effect size f2,a n d( 5 )p r e d i c t i v er e l e v a n c eQ2and blindfolding.\nRegarding collinearity (1), the results suggest minimal collinearity\namong the constructs (the highest VIF among the explanatory variables\nis 2.95), which means the predictors in the structural model do not\nsuffer from this issue. To empirically assess the hypotheses postulated\ninSection 3 , the study examines the level of signi ficance in pathTable 1\nSample pro file.\nSample characteristics (n = 175) Obs. (%)\nRespondent position\nIT executive\nChief Information Of ficer (CIO) 22 12.5%\nIT Director 26 14.8%IT Manager 32 18.2%Other IT executive 23 13.1%\nBusiness executive\nChief Financial Of ficer (CFO) 19 10.9%\nBusiness Manager - Strategic Planning 18 10.3%\nCentral Operations Of ficer (COO) 14 8.0%\nOther Business executive 21 12.0%\nNo. 
of employees\nb50 14 8.0%\n50–250 76 43.4%\nN250 85 48.5%\nIndustry\nManufacturing 23 13.1%Electricity, gas and water supply activities 11 6.2%Wholesale and retail trade 19 10.8%Transports and telecommunications 18 10.2%Financial intermediation 71 40.5%Others 33 18.8%\nNotes: (1) The firm size is categorised based on European enterprises size classi fication\n[104]; (2) The industries of activity are in accordance with NACE (European standard clas-sification of productive economic activities).Table 2\nTesting possible response bias: early vs. late respondents.\nConstructs Full sample\nN = 175Early\nrespondentsN=9 2Late\nrespondentsN=8 3Kolmogorov-\nSmirnov test\nMean S.D. Mean S.D. Mean S.D. p-Value\nENKM 5.9 0.71 5.9 0.67 5.9 0.75 0.65\nEXKM 5.8 0.86 5.9 0.85 5.7 0.86 0.07\nKSP 4.8 0.89 4.8 0.80 4.7 0.98 0.30AG 6.1 0.93 6.1 0.78 6.0 1.07 0.72PLP 6.1 0.81 6.1 0.78 6.0 0.83 0.23CA 5.9 0.82 6.0 0.72 5.8 0.92 0.34SP 6.0 0.81 6.0 0.72 6.0 0.89 0.76FP 5.9 0.96 6.0 0.81 5.7 1.09 0.16383 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 6]\ncoefficients (2) by means of a bootstrapping technique ( Hair et al.,\n2011; Henseler et al., 2009 ) with 5000 iterations of re-sampling, with\neach bootstrap sample constituted by the number of observations\n(i.e., 175 cases). To have more conservative outcomes, the study uses\ntheno sign change option ( Hair et al., 2013 ).Fig. 2 shows the estimated\nmodel (path coef ficients, R2and Q2), and Table 5 summarizes the\nresults. Concerning R2values (3), all dependent variables present rea-\nsonable values. In addition, this study calculates the f2and q2effect\nsizes (4). Most of the values of f2effect size are small, with the exception\nof agility in process-level-performance and exogenous knowledge\nmanagement in agility (moderate effects). Last, based on a blindfolding\nprocedure, all Q2values are above zero, which means the model has\npredictive power concerning the dependent variables (see Fig. 
2).\nFig. 2 summarizes the analysis results as follows: the conceptual\nmodel explains 61.8% of the variation in organizational agility. Endogenous Knowledge Management (EnKM) (β = 0.155; p < 0.01) and Exogenous Knowledge Management (ExKM) (β = 0.248; p < 0.001) are\nstatistically significant in explaining organizational agility (AG). Thus,\nH1 and H2 are confirmed, whereas knowledge sharing partners (KSP)\n(H3) is not confirmed. Organizational agility (AG) (β = 0.371;\np < 0.001) is statistically significant in explaining Process-level Performance (PLP), and consequently H4b is supported. The conceptual\nmodel explains 57.8% of the variation in Process-level Performance\n(PLP). Agility (AG) contributes significantly to explain performance at two levels: Process-level Performance (PLP) (β = 0.371; p < 0.001)\nand Competitive Advantage (CA) (β = 0.204; p < 0.01), which confirms\nH4a and H4b. H5 is not supported, as the effect is statistically not significant (PLP → CA). The conceptual model explains 77.8% of the variation\nin Competitive Advantage (CA). The conceptual model substantially explains the variation of all three dependent variables (Chin, 1998b;\nHenseler et al., 2009).\n5.3. Mediating effect testing\nBased on the guidelines of Hair (Hair et al., 2013), Preacher\n(Preacher & Hayes, 2008), and Nitzl (Nitzl, Roldán, & Cepeda,\n2016), the study evaluates the significance of the mediating effects\nof organizational agility. Mediation analysis is eligible if the indirect\neffect is significant. Table 6 presents the results, which fulfill the necessary conditions to perform the mediator assessment. Also, the\nstudy calculates variance accounted for (VAF) to determine the size\nof the indirect effect in relation to the total effect (Hair et al.,\n2013). 
The results show that agility can partially mediate the\nrelationship between knowledge assets (endogenous and exogenous\nknowledge) and performance (process-level performance andcompetitive advantage), thereby supporting H6a,b and H7a,b. No\nmediating effects were found between knowledge sharing withTable 3\nLoadings and cross-loadings for the measurement model.\nConstruct Item ENKM EXKM KSP AG PLP FP SP\nEndogenous knowledge management ENKM1 0.715 0.171 0.270 0.264 0.240 0.266 0.180\nENKM2 0.796 0.092 0.393 0.184 0.094 0.331 0.190\nENKM3 0.915 0.317 0.294 0.450 0.322 0.476 0.371\nENKM4 0.826 0.313 0.135 0.374 0.331 0.508 0.365\nExogenous knowledge management EXKM1 0.086 0.797 -0.183 0.390 0.365 0.328 0.345\nEXKM2 0.214 0.899 -0.136 0.495 0.477 0.446 0.403\nEXKM3 0.397 0.775 0.057 0.444 0.636 0.515 0.434\nKnowledge sharing partners KSP1 0.383 −0.012 0.873 −0.125 −0.140 −0.167 −0.156\nKSP2 0.324 −0.058 0.939 −0.145 −0.185 −0.116 −0.192\nKSP3 0.210 −0.140 0.960 −0.245 −0.276 −0.199 −0.300\nAgility AG2 0.395 0.453 −0.182 0.860 0.576 0.586 0.729\nAG3 0.397 0.482 −0.189 0.931 0.604 0.619 0.665\nAG4 0.402 0.538 −0.085 0.905 0.608 0.607 0.627\nAG5 0.327 0.494 −0.263 0.928 0.590 0.640 0.682\nPerformance at process level PLP1 0.315 0.629 −0.231 0.676 0.951 0.571 0.563\nPLP2 0.308 0.533 −0.204 0.558 0.939 0.525 0.552\nCompetitive advantage Financial performance FP1 0.445 0.501 −0.238 0.675 0.571 0.950 0.728\nFP2 0.531 0.496 −0.071 0.594 0.487 0.949 0.665\nFP3 0.477 0.518 −0.199 0.657 0.594 0.950 0.704\nStrategic performance SP1 0.343 0.363 −0.134 0.615 0.507 0.584 0.840\nSP2 0.327 0.445 −0.298 0.683 0.499 0.719 0.932\nSP3 0.321 0.485 −0.230 0.715 0.590 0.681 0.927\nThefigures in bold represents the cross-loadings for the measurement model.\nTable 4\nCorrelation matrix, composite reliability (CR), and square root of AVEs.\nCR ENKM EXKM KSP AG PLP FP SP\nEndogenous knowledge management (ENKM) 0.89 0.82\nExogenous knowledge management (EXKM) 0.87 0.30 0.83\nKnowledge Sharing 
with Partners (KSP) 0.95 0.31 −0.09 0.93\nAgility (AG) 0.95 0.42 0.54 −0.20 0.91\nProcess level performance (PLP) 0.94 0.33 0.62 −0.23 0.66 0.95\nFinancial performance (FP) 0.97 0.51 0.54 −0.18 0.68 0.58 0.95\nStrategic performance (SP) 0.93 0.37 0.49 −0.25 0.75 0.59 0.74 0.90\n(1) First column are CR (composite reliability).\n(2) Diagonal elements are square root of average variance extracted (AVE).(3) Off-diagonal elements are correlations.The bold figures represent the square roots of AVEs.384 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 7]\npartners and performance (process-level performance and competi-\ntive advantage), which means H6c and H7c are not con firmed.\n6. Discussion\nAs BDA can generate value in several ways, the need exists to under-\nstand the entire chain. This study fills the research gap by assessing not\nonly the antecedents but also the effects of BDA initiatives in European\nfirms.\nThe results strongly support the claim that BDA applications can\nallow an effective internal and external knowledge management which\ncan help firms to create organizational agility. This agility exists in several\nways: (1) by sensing opportunities and threats (e.g., reacting to new\nproducts or services of competitors); (2) by seizing possible chances\n(e.g., expanding into new regional or international markets), and\n(3) by adjusting to the technological environment to attain competitive\nadvantage (e.g., adopting new technologies to produce products andservices more ef ficiently). 
This finding is consistent with earlier literature\n(Chen et al., 2014; Liu et al., 2014; Sher & Lee, 2004 ).\nRegarding the antecedents, the results demonstrate that BDA can\nsupport organizational knowledge management, allowing the crea-\ntion/enhancement of dynamic capabilities such as organizational agility.\nThisfinding is consistent with earlier studies applied to IT innovations\nand organizational management (e.g., ( Nieves & Haller, 2014; Sher &\nLee, 2004; Cai et al., 2013; Liu et al., 2014; Cepeda & Vera, 2007 )). The\nresults suggest that exogenous knowledge management deserves\nmore attention, which was considered more important than endoge-\nnous knowledge management. This outcome suggests that BDA\ntechnologies can provide business value by facilitating the acquisition\nof supply chain and marketing knowledge. While knowledge manage-\nment is important to explain BDA value creation, the way of sharing\nthis strategic asset among business partners is not statistically signi fi-\ncant in this study. Although the hypothesis related to the knowledge\nshared with partners ( H3) seems plausible and consistent with earlier\nstudies for other IT innovations (e.g., ( Zhu & Kraemer, 2005; Zheng\nFig. 2. Estimated model. Note: ns = non-signi ficant. ** |t| N=1.96 at p = 0.05; *** |t| N=2 . 5 7a tp=0 . 
01 level; **** |t| ≥ 3.29 at p = 0.001 level.\nTable 5\nSignificance testing results of the structural model path coefficients.\nStructural path Path coefficient (t-value) Effect size (f2) Effect size (q2) 95% confidence interval Conclusion\nEndKM → AG 0.155** (2.562) 0.038 0.024 [0.032; 0.268] H1 supported\nExKM → AG 0.248**** (4.556) 0.120 0.074 [0.149; 0.364] H2 supported\nKSP → AG 0.010 ns (0.121) 0.000 0.000 [−0.145; 0.169] H3 not supported\nAG → CA 0.204*** (2.786) 0.064 0.021 [0.065; 0.351] H4a supported\nAG → PLP 0.371**** (3.969) 0.125 0.080 [0.173; 0.544] H4b supported\nPLP → CA 0.106 ns (1.579) 0.021 0.007 [−0.030; 0.234] H5 not supported\nNote: ns = non-significant.\nThe values of f2 and q2 effects can be considered weak (0.02), moderate (0.15) and strong (0.35).\nConfidence level:\n** |t| ≥ 1.96 at p = 0.05 level.\n*** |t| ≥ 2.57 at p = 0.01 level.\n**** |t| ≥ 3.29 at p = 0.001 level.\n385 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379–390\n\n[Página 8]\net al., 2011; Ruivo, Oliveira, & Neto, 2014)), this construct does not contribute to creating valuable organizational agility. An earlier study concludes that using this type of knowledge is not always useful and can\nharm specific business processes in some situations. Moreover, this\nstudy shows that agility can partially mediate the positive effect of\nsome knowledge assets (exogenous and endogenous) and performance\n(process-level performance and competitive advantage) (H6a, H6b and\nH7a, H7b). This finding is consistent with earlier studies (Liu et al., 2013;\nLiu et al., 2014; Pavlou & El Sawy, 2006).\nCompetitive performance is not only about how much firms know,\nbut how they use what they know (Haas & Hansen, 2005). A possible\nexplanation for this result is that firms are reluctant to share sensitive\ninformation that might compromise their competitive advantage. 
In\nfact, synergies with business partners can be bene ficial (e.g.,( Setia,\nRichardson, & Smith, 2015 )), but careful attention is needed regarding\nthe shared information. The study shows that knowledge sharing with\npartners can be truly compromising in the areas of Production and Op-\nerations or Product and Service enhancement, which represent the core\nbusiness practices of a firm. An information sharing agreement might be\na solution to overcome this constraint.\nConcerning the effects of agility leveraged by BDA, the results indi-\ncate that this dynamic capability can positively impact competitive ad-\nvantage in different ways (via processes or organizationally), which is\nin line with the findings of other authors ( Drnevich & Kriauciunas,\n2011; Protogerou et al., 2012 )(H4a,b). Agility can also be more effective\nin improving speci fic business processes than organizational perfor-\nmance, which is consistent with Drnevich and Kriauciunas ( Drnevich\n& Kriauciunas, 2011 ).The results demonstrate that no signi ficant link\nexists between process-level performance and competitive advantage\n(H5). In this sense, Drnevich and Kriauciunas ( Drnevich & Kriauciunas,\n2011 ) argue that a firm's performance depends on a set of elements\nthat might fail due to miscommunication between the business areas\nand the top management. Although some business areas can behave\nin an ef ficient way, this ef ficiency does not necessarily have a signi ficant\neffect on the overall performance.\nAlthough BDA technologies are generaly associated with customer\nmanagement or marketing areas, results indicate that, in general,\nEuropean firms focus more on internally improving their assets\n(products and services) and the way that these are being produced to\noptimize costs. With Europe still showing signs of financial crisis, this\nfinding might point the way to a change of survival strategy in compet-\nitive markets.\n6.1. 
Limitations and further research\nCertain limitations apply to the interpretation of the results of this\nstudy. First, the antecedents of agility do not extend beyond the speci fic\nknowledge resources included in the model. Other factors can also\ndetermine the development of this dynamic capability in European\nfirms. Future studies may include these resources as variables of themodel or by moderating existing variables. Second, although the study\nconsiders constructs in the model embedding the impact of BDA at\nprocess-level, the model is firm-level. Before generalization is possible,\nresearchers should perform a longitudinal study based on the process\napproach. Future research should use speci ficp r o c e s sc o n s t r u c t st o\nassess the impact of BDA on several business areas in detail. Third, due\nto the perceptual nature of the measures used, future studies should\nidentify the issues associated with cross-sectional research design.\nAlthough the use of objective measures to assess firm performance is\nimportant, in this study companies were reluctant to provide them.\nFourth, although the sample size is statistically adequate, a larger\nsample could be useful to reinforce the conclusions of this study.\nAs researchers generally accept that BDA can provide bene fits to all\nEuropean firms ( European_Commission, 2015 ) across several indus-\ntries, reinforced on a McKinsey survey ( Manyika et al., 2011b )r e p o r t s\nthat most industries in Europe have the capacity to store and manipu-\nlate big data, and consequently the potential value of using big data\nresides mainly in developed countries. Therefore, data from five\nEuropean developed countries were collected. By conducting future\nstudies in more countries and industries, which may have different per-\nceptions of BDA and diverse external contexts, the understanding of\nBDA business value could likely improve. 
Due to their different cultures,\nresearch to perform a comparative study among European regions\n(e.g., Northern and Southern Europe) could be interesting.\n6.2. Theoretical implications\nThis study offers two key contributions that extend theory on BDA in\ntechnology and organizational management research:\n(1)BDA value chain understanding - Despite the potential bene fits,\nsome firms fail to capture value from BDA initiatives ( Kaisler\net al., 2013 ). Recent papers focus on BDA research opportunities\n(Abbasi et al., 2016; Agarwal & Dhar, 2014 ), claiming that there is\na need to conduct assessments of the actual impact of BDA\ninvestments and use, and to understand how to achieve the\nbenefits for performance. The BDA value chain remains relatively\nunexplored and requires further investigation. The current paper\nresponds to the calls of scholars by empirically assessing the\nvalue that BDA can bring to European firms. This study theoreti-\ncally proposes and empirically validates a conceptual model\nbased on strategic management theories (KBV and DC), never\nbefore combined for this purpose, to explain the full BDA value\nchain. Liu ( Liu et al., 2014 ) argues that literature about the\nrelationship among knowledge management, organizational\nagility, and firm performance is still limited. 
This is the first\nstudy that empirically demonstrates that BDA applications\nbased on an effective knowledge management can help firms to\ncreate organizational agility leading to competitive advantage.\nFurther studies could bene ficially use this theoretical framework\nto assess the business value in other IT innovations at a process-Table 6\nMediation test by bootstrapping approach.\nEffect of Direct effect (t-value) Indirect effect (t-value) Total effect VAF (%) Interpretation Conclusion\nEnKM→AG→CA 0.137 ⁎⁎(2.317) 0.053 ⁎⁎(2.156) 0.190 ⁎⁎⁎⁎(3.577) 27.89% Partial mediation H6a supported\nExKM→AG→CA 0.081 ns (1.506) 0.097 ⁎⁎⁎(2.617) 0.178 ⁎⁎⁎⁎(4.037) 54.49% Partial mediation H6b supported\nKSP→AG→CA 0.026 ns (0.464) −0.014 ns (0.607) 0.012 ns (0.199) na No mediation H6c not supported\nEnKM→AG→PLP 0.141 ⁎⁎(1.988) 0.057 ⁎⁎(2.212) 0.198 ⁎⁎⁎(2.813) 28.79% Partial mediation H7a supported\nExKM→AG→PLP 0.344⁎⁎⁎⁎(5.412) 0.092⁎⁎⁎(3.041) 0.436⁎⁎⁎(7.219) 21.10% Partial mediation H7b supported\nKSP→AG→PLP −0.157⁎⁎(2.408) 0.003 ns (0.119) −0.154⁎⁎(2.172) na No mediation H7c not supported\nNote: VAF = variance accounted for. VAF > 80% indicates full mediation. 20% ≤ VAF ≤ 80% shows partial mediation. VAF < 20% indicates no mediation. ns = non-signi ficant. na = not\napplicable.\n⁎⁎|t| ≥ 1.96 at p = 0.05 level.\n⁎⁎⁎|t| ≥ 2.57 at p = 0.01 level.\n⁎⁎⁎⁎ |t| ≥ 3.29 at p = 0.001 level.386 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 9]\nlevel and firm-level. Academics can make use of this paper for\npedagogical support for teaching about BDA value chain.\n(2)DC literature –This paper contributes to DC research by empir-\nically testing agility business value in a BDA context ( Drnevich\n& Kriauciunas, 2011 ). The results strongly support the belief\nthat BDA technologies can trigger agility and that agility can af-\nfect competitiveness in two ways (via processes or globally). 
AsBDA can signi ficantly improve business processes ( Davenport,\n2006 ), business process enhancement driven by BDA is an im-\nportant research area ( Abbasi et al., 2016 ). Earlier studies focus\nonly on the link between agility and firm performance ( Chen\net al., 2014; Liu et al., 2014; Tallon & Pinsonneault, 2011 ), while\nthis study empirically demonstrates that an effect of agility exists\nat the process-level, too. In addition, despite an increasing use of\nmediation testing, most of the studies in PLS-SEM do not analyze\nmediation effects ( Hair et al., 2013; Nitzl et al., 2016 ). Under-\nstanding mediation issues can be crucial for researchers because\nthey can better explain or hinder the in fluence of a third variable\nin the relationship between two variables in a model ( Cepeda &\nVera, 2007 ). This study demonstrates that agility can be a\nmediator between external and internal knowledge assets and\nperformance (process-level performance and competitive\nadvantage).\n6.3. Managerial implications\nFor practitioners (including executives and IT managers) this study\ndemonstrates how best to leverage the knowledge embedded in BDA\nsystems and initiatives and achieve capabilities that will help to main-\ntain competitive advantages. The paper provides support to justify\nBDA investments and initiatives. The results indicate that although\nBDA technologies call for substantial investment in implementation\nand maintenance, European firms are aware of BDA's potential value\nand bene fits. Executives should apply these guidelines to their organiza-\ntional IT strategy.\nBDA can provide value at several stages: (1) knowledge; (2) dynamic\ncapability (organizational agility); (3) business process; and (4) com-\npetitive performance. To initiate the value creation process, firms\nshould invest in an effective BDA program. First, the value that BDA\ncan provide derives first from the way firms use the technologies\navailable to manage knowledge. 
An effective training program can\nhelp to leverage the way users extract and manage knowledge. Second,\nby effectively using BDA, firms can acquire capabilities to innovate and\nrapidly adjust to external demands (e.g., optimize business processes).\nThird, these capabilities will encourage speci fic business areas to\ninvolve the whole organization, when an effective bottom-up strategy\nis followed, supported by good communication practices. By applying\nthis framework to BDA speci fically, managers and IT executives can\nbenefit from a performance metric that uniquely speci fies the impact\nof BDA. By evaluating the organizational knowledge conversion into\nprocess and firm-level capabilities, practitioners can increase their\nproductivity. Software vendors of BDA can also gain a better under-\nstanding of how European firms can invest and experience the value\ncreated through BDA. They can natively embed BDA capabilities in\ntheir solutions as a way for their customers to achieve superior financial\nand strategic performance. Finally, firms that have not yet decided to\nadopt these technologies can gain a perception of what is possible by\nadopting and effectively using BDA.\n6.4. Business research implications\nThe business community now sees big data as a potential tool of\nbusiness value for achieving competitive advantage. This value can\nonly be real if companies know how to effectively manage Big Data An-\nalytics (BDA) initiatives. This paper establishes a first link between BDAprocess-level performance and competitive advantage, by merging the\nfield of information systems and strategic management. By presenting\nand discussing strategic and organizational drivers and impacts of\nBDA, guidance to business researchers, practitioners, and scholars is\nprovided. 
As such, this paper extends knowledge by directly evaluating\nthe effect of BDA on the decision-making process to support an effective\nIT resource management, focusing on challenges for adoption, governance, and evaluation.\nThe outcomes of this paper indicate that BDA can be an effective\naid to survival in competitive markets, particularly by supporting\nProduction and Operations or Product and Service enhancement.\nStriving to overcome damages of the financial crisis, European\nfirms are using BDA tools to internally improve their assets (products\nand services) and the way that these are being produced to optimize\ncosts. European firms tend to attribute greater value to external\nknowledge provided by BDA applications than to internal knowledge\nmanagement. Sharing knowledge with business partners is poten-\ntially harmful to organizational productivity, so careful attention is\nin order when exchanging this type of core data between companies.\nAlso, this study concludes that organizational agility leads directly to\na better performance (process-level and competitive advantage) but\ncan mediate effects from knowledge assets on performance. This\nmeans that firms must bear in mind that several paths can lead to\ncompetitive advantage. First, managers should consider investing\nin BDA technologies to take advantage of internal and external\nknowledge resources. Second, by governing the knowledge extract-\ned by BDA, agility becomes the “ultimate” organizational capability\nthat leads to sustainable competitive advantages. Firms should\nconfidently invest in the development of agility supported by BDA\ntools.\n7. Conclusions\nAs Big Data Analytics (BDA) can offer value to companies in\nseveral ways, many scholars highlight the need to understand the\npath to competitive advantage. 
The main outcome emerging from\nthis paper has to do with understanding the value chain of BDA.\nGrounded on knowledge-based view (KBV) and dynamic capabilities\n(DC), this study fills a research gap from the strategic management\nperspective, by perceiving the antecedents (knowledge assets) and\nthe impacts (on process-level performance and competitive advan-\ntage) of BDA initiatives in European firms. The results show that\nthe model signi ficantly explains all dependent variables (61.8% of\nagility variation, 57.8% of process-level performance variation, and\n77.8% of competitive advantage variation). The major conclusions\nof this study are:\na) BDA can be a strategic investment for European firms to enhance or-\nganizational agility and survive in competitive markets. Firms\nshould invest in the development of organizational agility supported\nby effective BDA applications.\nb) To create agility, European firms tend to believe that the external\nknowledge deriving from BDA applications can be more effective\nin the creation of agility than internal knowledge. Sharing knowl-\nedge with business partners is problematic, as sharing, is a potential\nbarrier for process-level performance.\nc) Regarding the impacts of agility, this capability leads directly to a\nbetter performance (process-level and competitive advantage) but\ncan mediate effects from knowledge assets on performance. This\nmeans that BDA initiatives can lead to better operational ef ficiency,\nbut several paths can lead to competitive advantage.\nThus, a crucial need exists for firms to have an integrated view of the\nBDA chain in order to be able to fully leverage the innovative power of\nBDA capabilities to achieve competitive advantage.387 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 10]\nAppendix A. Survey questionnaire\nReferences\nAbbasi, A., Sarker, S., & Chiang, R. H. (2016). 
Big Data research in information systems: To-\nward an inclusive research agenda. Journal of the Association for Information Systems ,\n17(2), 3.\nAgarwal, R., & Dhar, V. (2014). Editorial —Big Data, data science, and analytics: The oppor-\ntunity and challenge for IS research. Information Systems Research ,25(3), 443 –448.\nAmbrosini, V., & Bowman, C. (2009). What are dynamic capabilities and are they a useful\nconstruct in strategic management? International Journal of Management Reviews ,\n11(1), 29 –49.\nArend, R., & Bromiley, P. (2009). Assessing the dynamic capabilities view: spare change,\neveryone? Strategic Organization ,7(1), 75.\nBarnett, W. P., Greve, H. R., & Park, D. Y. (1994). An evolutionary model of organizational\nperformance. Strategic Management Journal ,15(S1), 11 –28.\nBarton, D. (2012). Making advanced analytics work for you. Harvard Business Review ,90,\n78–83.\nBarua, A., Kriebel, C. H., & Mukhopadhyay, T. (1995). Information technologies and busi-\nness value: An analytic and empirical investigation. Information Systems Research ,\n6(1), 3 –23.\nBharadwaj, A. S. (2000). A resource-based perspective on information technology\ncapability and firm performance: An empirical investigation. MIS Quarterly ,24(1),\n169–196.Blome, C., Schoenherr, T., & Rexhausen, D. (2013). Antecedents and enablers of supply\nchain agility and its effect on performance: A dynamic capabilities perspective.\nInternational Journal of Production Research ,51(4), 1295 –1318.\nBrislin, R. W. (1970). Back-translation for cross-cultural research. Journal of Cross-Cultural\nPsychology ,1(3), 185 –216.\nCai, Z., et al. (2013). Developing organizational agility through IT capability and KM\ncapability. The moderating effects of organizational climate .P A C I S .\nCepeda, G., & Vera, D. (2007). Dynamic capabilities and operational capabilities: A\nknowledge management perspective. Journal of Business Research ,60(5),\n426–437.\nChau, M., & Xu, J. (2012). 
Business intelligence in blogs: Understanding consumer interac-\ntions and communities. MIS Quarterly ,36(4), 1189 –1216.\nChen, H., Chiang, R., & Storey, V. (2012). Business intelligence and analytics: From Big\nData to big impact. MIS Quarterly ,36(4), 1165 –1188.\nChen, Y., et al. (2014). IT capability and organizational performance: The roles of business\nprocess agility and environmental factors. European Journal of Information Systems ,\n23(3), 326 –342.\nChin, W. W. (1998a). Commentary: Issues and opinion on structural equation modeling.\nJSTOR, 7 –16.\nChin, W. W. (1998b). The partial least squares approach for structural equation modeling.\nChung, T. R. (2010). Knowledge creation and firm performance. In e. (Ed.), Mediating\nprocesses from an organizational agility perspective .A M C I S .Constructs Items Source\nKnowledge assets Please indicate the extent to which these forms of knowledge are used in your organization.\nBDA technologies:\nEndogenous knowledge\nManagementENKM1. Reduce uncertainties of knowledge loss\nENKM2. Reduce dependence on speci fic personnel\nENKM3. Are comprehensively utilized by members in organizationENKM4. Are comprehensively constructed in organization*(Sher & Lee, 2004 )\nExogenous knowledge\nManagementEXKM1. Facilitate acquisition of supply chain knowledge\nEXKM2. Facilitate processing of supply chain knowledge\nEXKM3. Facilitate processing of marketing knowledge(Sher & Lee, 2004 )\nKnowledge sharing with channel\npartnersKSP1. We frequently share knowledge about our business environment\n(e.g., other business relationships) with our channel partners.KSP2. Knowledge about all of our channel partners, competitors, etc., is shared with ourother channel partners.KSP3. Business insights are exchanged between us and our other channel partners.(Liu et al., 2014 )\nOrganizational agility (dynamic\ncapability)Please indicate the degree to which the use of BDA tools in the last three years has helped to:\nAG1. 
Respond to changes in aggregate consumer demand.*\nAG2. React to new product or service launches by competitors.\nAG3. Expand into new regional or international markets.AG4. Change (i.e., expand or reduce) the variety of products/services available for sale.AG5. Adopt new technologies to produce better, faster, and cheaper products and services.(Lu & Ramamurthy, 2011 )\nProcess-level performance To what extent has BDA been used to support critical business activities in each of the following\nprocesses in the last three years. A sampling of critical activities in each process is shown below.PLP1. Production and operations: improve throughout, boost labour productivity, improve flexibility\nand equipment utilisation, and streamline operations.\nPLP2. Product and service enhancement: embed IT in products, increase pace of development/R&D,\nmonitor design cost, improve quality, support innovation.PLP3. Marketing and sales: spot market trends, anticipate customer needs, build market share,improve forecast accuracy, and evaluate pricing options.*PLP4. Customer relations: respond to customer needs, provide after-sales service and support, improvedistribution, create customer loyalty*(Peteraf & Barney, 2003 )\nCompetitive advantage Please indicate the degree to which you agree with the following statements.\nStrategic Performance\nSP1. We have gained strategic advantages over our competitorsSP2. We have a large market share.SP3. Overall, we are more successful than our major competitors.Financial performanceFP1. Our EBIT (earnings before interest and taxes) is continuously above industry average.FP2. Our ROI (return on investment) is continuously above industry average.FP3. 
Our ROS (return on sales) is continuously above industry average.(Schilke, 2014 )\nControl variables\nTime since BDA adoption Number of years since adoption (#)Country CountryIndustry Type of industryTechnological turbulence Please indicate the degree to which you agree with the following statements.\nTT1. Extent of technological turbulence in the environment.TT2. Leadership in product/process innovation.\nTT3. Impact of new technology on operations.(Brislin, 1970 )\nNotes: (1) * items eliminated due low loading. (2) Items were measured using a 7-point numerical scale (1 is Strongly Disagree and 7 is Strongly Agree).388 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 11]\nCorte Real, N., Oliveira, T., & Ruivo, P. (2014). Understanding the hidden value of business\nintelligence and analytics (BI&A). Twentieth American Conference of Information\nSystems . Savannah, Georgia: Association of Information Systems.\nDavenport, T. H. (2006). Competing on analytics. Harvard Business Review ,84,1–12.\nDella Corte, V., & Del Gaudio, G. (2012). Dynamic capabilities: A still unexplored issue\nwith growing complexity. Corporate Ownership and Control ,9,3 2 7 –338.\nDrnevich, P. L., & Kriauciunas, A. P. (2011). Clarifying the conditions and limits of the con-\ntributions of ordinary and dynamic capabilities to relative firm performance. Strategic\nManagement Journal ,32(3), 254 –279.\nElbashir, M. Z., et al. (2013). Enhancing the business value of business intelligence: The role\nof shared knowledge and assimilation. Journal of Information Systems ,27(2), 87 –105.\nErevelles, S., Fukawa, N., & Swayne, L. (2016). Big Data consumer analytics and the trans-\nformation of marketing. Journal of Business Research ,69(2), 897 –904.\nErickson, S., & Rothberg, H. (2015). Big Data and knowledge management: Establishing a\nconceptual foundation. Leading issues in knowledge management. Vol. Two . (pp. 204) 2.\nEuropean_Commission (2015). 
Towards a thriving data-driven economy. Accessed on:\n30th December 2015]; Available from http://ec.europa.eu/digital-agenda/en/\ntowards-thriving-data-driven-economy#Article\nFornell, C., & Larcker, D. F. (1981). Evaluating structural equation models with unobserv-\nable variables and measurement error. Journal of Marketing Research ,18,3 7 5 –381.\nGefen, D., & Straub, D. (2005). A practical guide to factorial validity using PLS-Graph: Tu-\ntorial and annotated example. Communications of the Association for Information\nSystems ,16(1), 5.\nGoldman, S. L., Nagel, R. N., & Preiss, K. (1995). Agile competitors and virtual organizations:\nStrategies for enriching the customer. Van Nostrand Reinhold.\nGrant, R. M. (1996). Prospering in dynamically-competitive environments: Organization-\nal capability as knowledge integration. Organization Science ,7(4), 375 –387.\nHaas, M. R., & Hansen, M. T. (2005). When using knowledge can hurt performance: The\nvalue of organizational capabilities in a management consulting company. Strategic\nManagement Journal ,26(1), 1 –24.\nHair, J. F., Ringle, C. M., & Sarstedt, M. (2011). PLS-SEM: Indeed a silver bullet. Journal of\nMarketing Theory and Practice ,19(2), 139 –152.\nHair, J. F., Jr., et al. (2013). A primer on partial least squares structural equation modeling\n(PLS-SEM). Sage Publications.\nHelfat, C., & Peteraf, M. (2009). Understanding dynamic capabilities: Progress along a de-\nvelopmental path. Strategic Organization ,7(1), 91.\nHelfat, C. E., et al. (2009). Dynamic capabilities: Understanding strategic change in organiza-\ntions. John Wiley & Sons.\nHenseler, J., Ringle, C. M., & Sinkovics, R. R. (2009). The use of partial least squares path\nmodeling in international marketing. Advances in International Marketing (AIM) ,20,\n277–320.\nIDC (2011). Big Data analytics. Future architectures, skills and roadmaps for the CIO .\nKaisler, S., et al. (2013). Big Data: Issues and challenges moving forward. 
In system sci-\nences (HICSS). 2013 46th Hawaii International Conference on System Sciences .I E E E .\nKwon, O., Lee, N., & Shin, B. (2014). Data quality management, data usage experience and\nacquisition intention of Big Data analytics. International Journal of Information\nManagement ,34(3), 387 –394.\nLaValle, S., et al. (2011). Big Data, analytics and the path from insights to value. MIT Sloan\nManagement Review ,52(2), 21 –31.\nLiu, H., Song, D., & Cai, Z. (2014). Knowledge management capability and firm performance:\nThe mediating role of organizational agility. PACIS.\nLiu, H., et al. (2013). The impact of IT capabilities on firm performance: The mediating\nroles of absorptive capacity and supply chain agility. Decision Support Systems ,\n54(3), 1452 –1462.\nLorenzoni, G., & Lipparini, A. (1999). The leveraging of inter firm relationships as a distinc-\ntive organizational capability: A longitudinal study. Strategic Management Journal ,\n20(4), 317 –338.\nLu, Y., & Ramamurthy, K. (2011). Understanding the link between information technology\ncapability and organizational agility: An empirical examination. MIS Quarterly ,35(4),\n931–954.\nMalladi, S. (2013). Adoption of business intelligence & analytics in organizations –An em-\npirical study of antecedents. 19th American Conference on Information Systems\n(AMCIS) Chicago, Illinois.\nManyika, J., et al. (2011a). In M.G. Institute (Ed.), Big Data: The next frontier for innovation,\ncompetition and productivity .M c K i n s e yG l o b a lI n s t i t u t e .\nManyika, J., et al. (2011b). Big Data: The next frontier for innovation competition and\nproductivity. McKinsey Global Institute.\nMata, F. J., Fuerst, W. L., & Barney, J. B. (1995). Information technology and sustained\ncompetitive advantage: A resource-based analysis. MIS Quarterly ,19(4), 487 –505.\nMelville, N., Kraemer, K., & Gurbaxani, V. (2004). Information technology and organiza-\ntional performance: An integrative model of IT business value. 
MIS Quarterly ,28(2),\n283–322.\nMenguc, B., & Auh, S. (2006). Creating a firm-level dynamic capability through capitaliz-\ning on market orientation and innovativeness. Journal of the Academy of Marketing\nScience ,34(1), 63 –73.\nMoore, G. C., & Benbasat, I. (1991). Development of an instrument to measure the percep-\ntions of adopting an information technology innovation. Information Systems\nResearch ,2(3), 192 –222.\nMorabito, V. (2015). Big Data and analytics: Strategic and organizational impacts. Springer.\nNieves, J., & Haller, S. (2014). Building dynamic capabilities through knowledge resources.\nTourism Management ,40,2 2 4 –232.\nNitzl, C., Roldán, J. L., & Cepeda, G. (2016). Mediation analyses in partial least squares\nstructural equation modeling. Helping researchers discuss more sophisticated models\n(pp. 3 –21).\nNonaka, I. (1995). The knowledge-creating company: How Japanese companies create the\ndynamics of innovation. Oxford University Press.Pavlou, P. A., & El Sawy, O. A. (2006). From IT leveraging competence to competitive ad-\nvantage in turbulent environments: The case of new product development.\nInformation Systems Research ,17(3), 198 –227.\nPavlou, P. A., & El Sawy, O. A. (2011). Understanding the elusive black box of dynamic ca-\npabilities. Decision Sciences ,42(1), 239 –273.\nPavlou, P. A., et al. (2005). Measuring the return on information technology: A\nknowledge-based approach for revenue allocation at the process and firm level.\nJournal of the Association for Information Systems ,6(7), 199 –226.\nPeteraf, M. A., & Barney, J. B. (2003). Unraveling the resource-based tangle. Managerial\nand Decision Economics ,24(4), 309 –323.\nPettigrew, A. M., Thomas, H., & Whittington, R. (2001). Handbook of strategy and manage-\nment. Sage.\nPodsakoff, P. M., et al. (2003). Common method biases in behavioral research: A critical\nreview of the literature and recommended remedies. Journal of Applied Psychology ,\n88(5), 879.\nPopovi č, A., et al. 
(2012). Towards business intelligence systems success: Effects of\nmaturity and culture on analytical decision making. Decision Support Systems ,54,\n729–739.\nPreacher, K. J., & Hayes, A. F. (2008). Asymptotic and resampling strategies for assessing\nand comparing indirect effects in multiple mediator models. Behavior Research\nMethods ,40(3), 879 –891.\nProtogerou, A., Caloghirou, Y., & Lioukas, S. (2012). Dynamic capabilities and their indirect\nimpact on firm performance. Industrial and Corporate Change ,21(3), 615 –647.\nRajpathak, T., & Narsingpurkar, A. (2013). Managing knowledge from Big Data analytics in\nproduct development. Tata Consulting, 11.\nRingle, C. M., Sarstedt, M., & Straub, D. (2012). A critical look at the use of PLS-SEM in MIS\nquarterly. MIS Quarterly (MISQ) ,3 6 ( 1 ) .\nRuggles, R. (1998). The state of the notion: Knowledge management in practice. California\nManagement Review ,40(3), 80 –89.\nR u i v o ,P . ,O l i v e i r a ,T . ,&N e t o ,M .( 2 0 1 4 ) . Examine ERP post-implementation stages of use\nand value: Empirical evidence from Portuguese SMEs. International Journal of\nAccounting Information Systems ,15(2), 166 –184.\nRuivo, P., Oliveira, T., & Neto, M. (2015). Using resource-based view theory to assess the\nvalue of ERP commercial-packages in SMEs. Computers in Industry ,73,1 0 5 –116.\nRussom, P. (2011). Big Data analytics. Fourth Quarter: TDWI Best Practices Report.\nRyans, A. B. (1974). Estimating consumer preferences for a new durable brand in an\nestablished product class. Journal of Marketing Research ,4 3 4 –443.\nSambamurthy, V., Bharadwaj, A., & Grover, V. (2003). Shaping agility through digital op-\ntions: Reconceptualizing the role of information technology in contemporary firms.\nMIS Quarterly ,2 3 7 –263.\nSambamurthy, V., et al. (2007). IT-enabled organizational agility and firms' sustainable\ncompetitive advantage. ICIS 2007 proceedings (pp. 91).\nSaraf, N., Langdon, C. S., & Gosain, S. (2007). 
IS application capabilities and relational value\nin inter firm partnerships. Information Systems Research ,18(3), 320 –339.\nSAS (2013). Big Data analytics. An assessment of demand for labour and skills, 2012 –2017 .\nSchilke, O. (2014). On the contingent value of dynamic capabilities for competitive advan-\ntage: The nonlinear moderating effect of environmental dynamism. Strategic\nManagement Journal ,35(2), 179 –203.\nSchryen, G. (2013). Revisiting IS business value research: What we already know, what\nwe still need to know, and how we can get there. European Journal of Information Sys-\ntems,22(2), 139 –169.\nSetia, P., Richardson, V., & Smith, R. J. (2015). Business value of partner's IT intensity:\nValue co-creation and appropriation between customers and suppliers. Electronic\nMarkets ,1–16.\nShanks, G., & Bekmamedova, N. (2013). Creating value with business analytics in the sup-\nply chain. European Conference of Information Systems. Utrecht: European Conference\non Information Systems .\nShanks, G., & Sharma, R. (2011). Creating value from business analytics systems: The im-\npact of strategy. 15th Paci fic Asia Conference on Information Systems: Quality Research\nin Paci fic, PACIS 2011 (pp. 1 –12). Queensland: Queensland University of Technology.\nSharma, R., Mithas, S., & Kankanhalli, A. (2014). Transforming decision-making processes:\nA research agenda for understanding the impact of business analytics on organisa-\ntions. European Journal of Information Systems ,23(4), 433 –441.\nS h e r ,P .J . ,&L e e ,V .C .( 2 0 0 4 ) . Information technology as a facilitator for enhancing\ndynamic capabilities through knowledge management. Information & Management ,\n41(8), 933 –945.\nSoh, C., & Markus, M. L. (1995). How IT creates business value: A process theory synthesis.\nInternational Conference of Information Systems . ICIS Proceedings.\nTallon, P. P. (2007). A process-oriented perspective on the alignment of information\ntechnology and business strategy. 
Journal of Management Information Systems ,\n24(3), 227 –268.\nTallon, P. P., & Pinsonneault, A. (2011). Competing perspectives on the link between stra-\ntegic information technology alignment and organizational agility: Insights from a\nmediation model. MIS Quarterly , 35(2).\nTeece, D. J. (2007). Explicating dynamic capabilities: The nature and microfoundations of\n(sustainable) enterprise performance. Strategic Management Journal ,28(13), 1319 –1350.\nTeece, D., Peteraf, M. A., & Leih, S. (2016). Dynamic capabilities and organizational agility:\nRisk, uncertainty and entrepreneurial management in the innovation economy. Un-\ncertainty and Entrepreneurial Management in the Innovation Economy (April 7, 2016) .\nTeece, D. J., Pisano, G., & Shuen, A. (1997). Dynamic capabilities and strategic manage-\nment. Strategic Management Journal ,18(7), 509 –533.\nVolberda, H. W. (1996). Toward the flexible form: How to remain vital in hypercompet-\nitive environments. Organization Science ,7(4), 359 –374.\nWade, M., & Hulland, J. (2004). Review: The resource-based view and information sys-\ntems research: Review, extension, and suggestions for future research. MIS\nQuarterly ,28(1), 107 –142.389 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n\n[Página 12]\nWang, C. L., & Ahmed, P. K. (2007). Dynamic capabilities: A review and research agenda.\nInternational Journal of Management Reviews ,9(1), 31 –51.\nWang, E., Klein, G., & Jiang, J. J. (2007). IT support in manufacturing firms for a knowledge\nmanagement dynamic capability link to performance. International Journal of\nProduction Research ,45(11), 2419 –2434.\nWeill, P., Subramani, M., & Broadbent, M. (2002). Building IT infrastructure for strategic\nagility. MIT Sloan Management Review ,44(1), 57.\nWu, L. -Y. (2006). Resources, dynamic capabilities and performance in a dynamic envi-\nronment: Perceptions in Taiwanese IT enterprises. 
Information & Management ,\n43(4), 447 –454.\nXu, Z., Frankwick, G. L., & Ramirez, E. (2016). Effects of big data analytics and traditional\nmarketing analytics on new product success: A knowledge fusion perspective.\nJournal of Business Research ,69(5), 1562 –1566.Zheng, S., Zhang, W., & Du, J. (2011). Knowledge-based dynamic capabilities and innova-\ntion in networked environments. Journal of Knowledge Management ,15(6),\n1035 –1051.\nZhou, K. Z., & Wu, F. (2010). Technological capability, strategic flexibility, and product in-\nnovation. Strategic Management Journal ,31(5), 547 –561.\nZhu, K., & Kraemer, K. (2005). Post-adoption variations in usage and value of e-business\nby organizations: Cross-country evidence from the retail industry. Information\nSystems Research ,16(1), 61 –84.\nZollo, M., & Winter, S. G. (2002). Deliberate learning and the evolution of dynamic\ncapabilities. Organization Science ,13(3), 339 –351.\nZott, C. (2003). Dynamic capabilities and the emergence of intraindustry differential firm\nperformance: Insights from a simulation study. Strategic Management Journal ,24(2),\n97–125.390 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390",
+ "cb2913fe-57a1-489a-8966-be97b8b4a2c0": {
+ "content": "Assessing business value of Big Data Analytics in European firms☆\nNadine Côrte-Real ⁎, Tiago Oliveira, Pedro Ruivo\nNOVA IMS, Universidade Nova de Lisboa, 1070-312, Lisboa, Portugal\nabstract article info\nAvailable online 9 August 2016 In the strategic management field, dynamic capabilities (DC) such as organizational agility are considered to be\nparamount in the search for competitive advantage. Recent research claims that IT business value research\nneeds a more dynamic perspective. In particular, the Big Data Analytics (BDA) value chain remains unexplored.\nTo assess BDA value, a conceptual model is proposed based on a knowledge-based view and DC theories. Toempirically test this model, the study addresses a survey to a wide range of 500 European firms and their IT\nand business executives. Results show that BDA can provide business value to several stages of the value chain.\nBDA can create organizational agility through knowledge management and its impact on process and\ncompetitive advantage. Also, this paper demonstrates that agility can partially mediate the effect betweenknowledge assets and performance (process level and competitive advantage). The model explains 77.8% of\nthe variation in competitive advantage. The current paper also presents theoretical and practical implications\nof this study, and the study's limitations.\n© 2016 Elsevier Inc. All rights reserved.Keywords:\nBig Data Analytics (BDA)\nIT business value\nKnowledge Based View (KBV)Dynamic capabilities (DC)Organizational agilityCompetitive advantage\n1. Introduction\nIn the era of Big Data, firms in every sector are required to deal with a\nhuge amount of data. Data in vast amounts can offer invaluable insights\nand competitive advantage if the right technological and organizational\nresources support them ( Morabito, 2015 ). 
Recently, several academics\nand practitioners have stressed the need to understand how, why, and\nwhen Big Data Analytics (BDA) applications can be a valuable resource\nfor companies to gain competitive advantage ( Abbasi, Sarker, &\nChiang, 2016; Agarwal & Dhar, 2014; Corte Real, Oliveira, & Ruivo,\n2014; LaValle et al., 2011 ). Although BDA technologies have been\nrecognized as the “next big thing for innovation ”(i.e., a potential source\nof business value and competitive advantage), the BDA value chain\nremains relatively unexplored and needs further investigation. No\nempirical research exists assessing how BDA can bring business value\n(Abbasi et al., 2016 ), establishing a linkage between knowledge assets,\norganizational agility, and performance (process-level and competitive\nadvantage) ( Corte Real et al., 2014 ). Firms that inject BDA in their\nbusiness operations can surpass their peers by 5% in productivity and\n6% in pro fitability ( Barton, 2012 ). For that reason, European firms are\ninvesting heavily in BDA technologies ( SAS, 2013; Sharma, Mithas, &\nKankanhalli, 2014 ). Nevertheless, this investment can only be valuableif organizations use the appropriate technology and organizational\nresources to achieve competitive advantage ( Manyika et al., 2011a ).\nIn response to the scarcity of research on this subject, this study\nexamines the impact of BDA on the business value chain in a\nEuropean context by empirically testing a new theoretical frame-\nwork that merges two strategic management theories (Knowledge\nB a s e dV i e w( K B V )a n dd y n a m ic capabilities (DC)) at firm-level. Not\nonly does this paper extend BDA research by transposing, merging,\nand examining hypotheses in IT innovations and management fields,\nbut also contributes to DC research by empirically assessing the ante-\ncedents and impacts of a speci fic dynamic capability (organizational\nagility), when using BDA technologies. 
This is the first paper that\nstudies the entire BDA value chain at firm-level, linking concepts of\nknowledge management, agility, and performance (process-level\nand competitive advantage). To clarify the role of agility on perfor-\nmance, this papers tests if agility is a mediator of knowledge assets\non performance (process-level performance and competitive\nadvantage). The study explores the following three research ques-\ntions (RQs):\nRQ1 –What are the BDA enablers for the creation of organizational\nagility?RQ2 –What are the impacts of this dynamic capability created by\nBDA on sustainable competitive advantage?\nRQ3 –Is agility a mediator of knowledge assets on performance\n(process-level performance and competitive advantage)?Journal of Business Research 70 (2017) 379 –390\n☆The author is grateful for the comments by anonymous reviewers, on earlier drafts of\nthis article.\n⁎Corresponding author.\nE-mail address: nreal@novaims.unl.pt (N. Côrte-Real).\nhttp://dx.doi.org/10.1016/j.jbusres.2016.08.011\n0148-2963/© 2016 Elsevier Inc. All rights reserved.\nContents lists available at ScienceDirect\nJournal of Business Research\nThis study offers guidance for executives and managers to assess the\nconditions under which BDA can add business value to organizations.\nManagers and IT executives can bene fit from an evaluation instrument\nto assess the impact of BDA. Also, this paper provides valuable support\nto justify BDA investments and initiatives. Firms that have not yet\ndecided to adopt these technologies can obtain a view of potential\ngains from adopting and effectively using BDA. 
This research demon-strates how best to leverage the knowledge embedded in BDA systems,\nacquiring organizational agility capabilities that lead toward competi-\ntive advantage.\nThe remainder of this paper has the following structure: Section 2\nprovides an introduction to the BDA concept and a theoretical\nbackground to assess BDA initiatives; Section 3 presents the conceptual\nmodel and the hypotheses; Section 4 outlines the methodology; and\nSection 5 shows the empirical results. Finally, the paper presents a\ndiscussion and the conclusions from the findings.\n2. Background2.1. Big Data Analytics\nChen, Chiang ( Chen, Chiang, & Storey, 2012 ) coined the term Big\nData Analytics (BDA) as a related field of business intelligence &\nanalytics (BI&A), referring to the BI&A technologies that mostly concern\ndata mining and statistical analysis. Authors de fine BDA as “an e w\ngeneration of technologies and architectures, designed to economically\nextract value from very large volumes of a wide variety of data, by enabling\nhigh velocity capture, discovery and/or analysis. ”(IDC, 2011 ). BDA tech-\nnologies allow firms to improve existing applications by offering\nbusiness-centric practices and methodologies that provide a competi-\ntive advantage ( Chen et al., 2012; Davenport, 2006 ). The latest literature\nindicates that there is much room for further BDA research ( Abbasi\net al., 2016; Agarwal & Dhar, 2014; Erevelles, Fukawa, & Swayne,\n2016 ). There are already academic studies that re flect the adoption\nand use of BDA (e.g., ( Malladi, 2013; Xu, Frankwick, & Ramirez, 2016;\nKwon, Lee, & Shin, 2014 )). Regarding value, most BDA academic studies\nfocus on analyzing business value from a data or system perspective\n(e.g., ( LaValle et al., 2011; Kwon et al., 2014 )). From the strategic\nmanagement perspective only one conceptual paper explores how\nBDA affects several marketing activities ( Erevelles et al., 2016 ). 
The\nremaining literature addresses industry primarily ( LaValle et al., 2011;\nRussom, 2011 ). As firms do not know how to capture business value\n(Barton, 2012; LaValle et al., 2011 ), some scholars ( Corte Real et al.,\n2014; Malladi, 2013 ) argue that BDA value research is scarce and\nneeds to extend beyond post-adoption stages toward competitiveness\n(Erevelles et al., 2016; Xu et al., 2016 ). Although numerous approaches\nassess IT Value at the process and firm levels (see Schryen ( Schryen,\n2013 ) for a review), this study extends IT business value research\nfrom the strategic management perspective, by empirically assessing\nthe BDA business value chain in European firms.\n2.2. Theoretical foundation\nMany studies in recent decades investigate IT business value and\ncompetitive advantage using the resource-based view (RBV) ( Barua,\nKriebel, & Mukhopadhyay, 1995; Bharadwaj, 2000; Mata, Fuerst, &\nBarney, 1995; Melville, Kraemer, & Gurbaxani, 2004; Ruivo, Oliveira, &\nNeto, 2015; Soh & Markus, 1995; Zhu & Kraemer, 2005 ). The limitations\nof RBV encourage the use of other theories such as DC and KBV ( Arend &\nBromiley, 2009; Wang & Ahmed, 2007 ). As DC theory constitutes the\nsecond foundation that supports knowledge-based thinking ( Pettigrew,\nThomas, & Whittington, 2001 ), this study combines these theories. KBV\nexplores a firm's potential to acquire competitiveness in a dynamic\nmarket context, but only DC theory can solve the problem of sustaining\ncompetitive advantage in turbulent environments ( Grant, 1996;\nVolberda, 1996 ).2.2.1. Knowledge Based View theory\nKBV states that a firm's knowledge resources are unique and\ninimitable and that the firm's primary function is to leverage them\ninto productive outcomes ( Grant, 1996; Nonaka, 1995 ). The possession\nof knowledge resources gives the firm basic foundations to renew or re-\nconfigure its resource base and to build dynamic capabilities ( Wu,\n2006 ), such as organizational agility. 
Companies that have high levels\nof staff knowledge and involvement can more skillfully identify the\nneed to make changes to existing resources and decide about the ac-\ntions necessary to implement these changes ( Nieves & Haller, 2014 ).\nKBV theory can help to conceptualize the performance effects of IT in-\nvestments ( Pavlou et al., 2005 ). Management studies use this theory\n(e.g., ( Nieves & Haller, 2014 )), as do studies in IT fields (e.g., ( Sher &\nLee, 2004 )) to understand the role of knowledge management in the\ncreation of DC. In BDA technologies, Xu, Frankwick ( Xu et al., 2016 )\nseek to understand the relationships among traditional marketing\nanalytics, BDA, and new product success. The current paper is the first\nthat empirically tests KBV to understand the role of BDA in the creation\nof agility.\n2.2.2. Dynamic capability theory\nIn the past decade the DC perspective arose as one of the most\neffective theoretical lenses for the strategic management field\n(Schilke, 2014 ), attracting the interest of scholars not only in business,\nbut also in the IT management field ( Helfat et al., 2009; Protogerou,\nCaloghirou, & Lioukas, 2012 ). Rooted in RBV and KBV, DC argues that\nthe dynamic capabilities enable firms to modify their resource to\nadapt rapidly to changing conditions, helping them to sustain their\ncompetitive advantage over time ( Helfat & Peteraf, 2009; Teece,\nPisano, & Shuen, 1997 ). Although the literature has a broad range of\ndefinitions for DC, one of the seminal papers de fines DC as “the ability\nto integrate, build, and recon figure internal and external competencies to\naddress rapidly-changing environments ”(Teece et al., 1997 ). 
DC\ndisaggregates into “the capacity (1) to sense and shape opportunities\nand threats, (2) to seize opportunities, and (3) to maintain competitive-ness through enhancing, combining, protecting, and, when necessary,\nrecon figuring the business enterprise's intangible and tangible assets ”.\nSome authors argue that agility is an organizational dynamic\ncapability ( Blome, Schoenherr, & Rexhausen, 2013; Sambamurthy\net al., 2007; Zhou & Wu, 2010 ). Teece ( Teece, 2007 )d efines agility as a\nhigher-order dynamic capability that emerges over time, generally\ndefining agility as a capability with which firms can identify and re-\nspond to environmental threats and opportunities and quickly adjust\ntheir behaviors ( Goldman, Nagel, & Preiss, 1995; Sambamurthy,\nBharadwaj, & Grover, 2003 ). This concept also relates to the operational\nflexibility of organizational processes and IT systems to support\nstructured or unstructured changes ( Chen et al., 2014 ). Achieving agility\ndemands processing a large and varied amount of information\n(Goldman et al., 1995 ). This process is possible with BDA applications.\nHowever, like IT applications ( Sambamurthy et al., 2003; Weill,\nSubramani, & Broadbent, 2002 ), BDA tools cannot automatically\nimprove agility. In fact, under certain conditions BDA tools can impede\nagility ( Chen et al., 2014 ). For this reason, the need exists to understand\nhow BDA applications can create agility.\nSeveral recent studies in the business management field apply DC\ntheory to measure the in fluence of DC in the creation of competitive ad-\nvantages (e.g., Schilke, 2014; Zott, 2003; Drnevich & Kriauciunas, 2011 ).\nIn the IT management field, few empirical studies use this theory.\nAnalyzing the IT in fluence on DC generically, ( Chen et al., 2014; Sher\n& Lee, 2004 ), researchers conclude that IT is an enabler of DC in\norganizations. 
Regarding agility, several studies assess the impact of IT\non organizational agility (e.g., Sambamurthy et al., 2007; Chen et al.,\n2014; Cai et al., 2013; Tallon & Pinsonneault, 2011; Liu et al., 2013; Lu\n& Ramamurthy, 2011 ). These studies demonstrate a positive relation-\nship between IT and agility. Chen ( Chen et al., 2014 ) recently concludes\nthat the IT business value essentially depends on how agile a firm is380 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nwith regard to managing business processes. Although the literature\naddresses the impact of IT on the creation of organizational agility, no\nstudy links BDA with this speci fic DC. Apart from some qualitative stud-\nies in the area of business analytics (BA) ( Shanks & Bekmamedova,\n2013; Shanks & Sharma, 2011 ), only conceptual papers use DC theory\nto study BDA value ( Corte Real et al., 2014; Erevelles et al., 2016 ).\nFirms that do not develop the resources and capabilities to use BDA\napplications will struggle to develop a sustainable competitive advan-\ntage ( Erevelles et al., 2016 ). Given that agility is vital for companies´\nsurvival, and that BDA can support organizational business processes,\nthis study fills this academic gap and links the two concepts empirically.\n3. Conceptual model\nWith recourse to the two strategic management theories (KBV and\nDC) discussed above, this section explains the conceptual model and\nthe speci fic hypotheses ( Fig. 1 ).\nRooted in an earlier conceptual model ( Corte Real et al., 2014 ), this\nresearch model empirically tests 12 propositions. The study assesses\nthe entire value chain starting with how BDA can leverage different\nforms of knowledge to create organizational agility ( H1,H2,H3). BDA\ntechnologies can provide organizational agility to the firm by using\neffective knowledge management. 
Firms owning this type of dynamic\ncapability can achieve competitive advantage directly ( H4a)o ri n d i r e c t -\nly through business processes ( H4b). Results obtained by using business\nprocesses will impact the overall organization ( H5). Agility can also\nmediate the relationship between knowledge assets and performance\n(H6a,b,c-H7a,b,c). BDA uses some controls such as country, industry,\ntechnological turbulence, and time.\n3.1. Hypothesis3.1.1. Knowledge assets\nOrganizational knowledge such as operational routines, skills, and\nknow-how constitutes a key source of competitiveness ( Grant, 1996 ).\nKnowledge management plays a critical role in pro ficiently managing\ndata and delivering it to the end users to support business processes\n(Rajpathak & Narsingpurkar, 2013 ). Knowledge management repre-\nsents a dimension supported by KBV ( Ruggles, 1998 ) and enables\ndynamic capabilities by offering speci fic functional competences that\ncan improve business performance ( Teece et al., 1997 ). A naturalrelationship exists between KM and BDA. Both deal with intangible\nassets such as data, knowledge, and intelligence ( Erickson & Rothberg,\n2015 ). BDA is a source of knowledge management, allowing firms to\nadd value primarily at the beginning of the information value chain\nand helping knowledge to flow to achieve business excellence ( Chau\n& Xu, 2012; Popovi čet al., 2012 ).\nBig data is a potential knowledge asset, contingent upon the proper\nuse of that knowledge ( Erickson & Rothberg, 2015\n). BDA represents\ntechnologies drivers of a strategic knowledge asset (big data). BDA\napplications have the potential to add value by providing more\ntransparent and accurate results to support decision-making in several\nbusiness areas ( Manyika et al., 2011a ).\nBDA strategy requires the capacity to sense, acquire, process, store,\nand analyze the data and convert that data into knowledge ( Rajpathak\n& Narsingpurkar, 2013 ). 
Several empirical studies state that the knowl-\nedge processes are antecedent dimensions of successful DC, by allowing\nfirms to continually renew their knowledge base and deliver business\nperformance ( Ambrosini & Bowman, 2009; Sher & Lee, 2004; Zheng,\nZhang, & Du, 2011 ). As DC are information-intensive ( Pavlou & El\nSawy, 2011 ), BDA may help in the creation of DC and organizational\nagility speci fically. Using BDA technologies helps to store and share\nknowledge, thereby allowing for an improvement of organizational\nknowledge by promoting ef ficiency within an organization, particularly\nby data integration and the use of analytical tools ( Russom, 2011 ). Some\nauthors argue that firms must combine endogenous and exogenous\nknowledge to achieve DC ( Sher & Lee, 2004 ). Zhao ( Cai et al., 2013 )\nargues that IT capability and KM capability are important in fostering\norganizational agility. Agility is promoted through knowledge manage-\nment by improving innovative responses, and can improve through the\nuse of IT and automated business processes ( Cai et al., 2013 ). In the\nsame way, organizations should be able to use BDA technologies to\nconvert knowledge into new routines and enhance organizational\nagility. Based on these findings, the hypotheses are:\nH1. BDA technologies allow an effective endogenous knowledge\nmanagement that positively in fluences dynamic capabilities such as\norganizational agility.\nH2. BDA technologies allow an effective exogenous knowledge\nmanagement that positively in fluences dynamic capabilities such as or-\nganizational agility.\nFig. 1. Proposed conceptual model.381 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nKnowledge sharing with key channel partners refers to the extent to\nwhich a firm shares insights and know-how about its business context\nwith its partners ( Saraf, Langdon, & Gosain, 2007 ). Channel partners\nare considered to be tactically and strategically important for\ncompanies. 
They can help to collect crucial market-related information\nwith which to fine tune the strategy to meet customer needs, resulting\nin long-term financial performance ( Lorenzoni & Lipparini, 1999 ).\nLiterature points out that the collaborative knowledge sharing capacity\nprovides an opportunity to increase value (e.g.,( Saraf et al., 2007 )) and\nenable DC (e.g., ( Della Corte & Del Gaudio, 2012 )). Considering that\nDC theory encompasses several levels of analysis, it is important to\nconsider the relational view, including the ability to collaborate with\nchannel partners ( Teece, 2007 ). Literature shows that agility needs the\nsupport of effective knowledge sharing ( Liu, Song, & Cai, 2014 ). Some\nstudies link the knowledge sharing capability through IT with agility\n(e.g., ( Cai et al., 2013; Liu et al., 2014 )). Such interactions can also\nbenefit from the use of BDA technologies, consequently enhancing\norganizational agility by in fluencing the capabilities to sense opportuni-\nties and threats, shape them, and seize them ( Della Corte & Del Gaudio,\n2012 ). Therefore, another hypothesis is:\nH3. BDA technologies allow an effective knowledge sharing with\npartners that positively in fluences organizational dynamic capabilities\nsuch as organizational agility.\n3.1.2. Organizational agility\nDC can play a key role in determining a firm's competitive advantage\n(Teece et al., 1997; Zott, 2003 ). Agility is the “capacity of an organization\nto efficiently and effectively redeploy/redirect its resources to value cre-\nating and value protecting (and capturing) higher-yield activities as in-\nternal and external circumstances warrant ”(Teece, Peteraf, & Leih,\n2016 ). In the management field several researchers recognize that DC\ndoes not lead directly to sustainable competitiveness, and that this\nvalue derives from improved business processes (e.g., ( Schilke, 2014;\nDrnevich & Kriauciunas, 2011 )). 
Some authors conclude that agility\ncan in fluence organizational performance ( Cai et al., 2013; Liu et al.,\n2013; Tallon & Pinsonneault, 2011 ). Hence, additional hypotheses are:\nH4a. Organizational agility is a dynamic capability leveraged by BDA\nthat positively affects the creation of competitive advantages.\nH4b. Organizational agility is a dynamic capability leveraged by BDA\nthat positively in fluences the process-level performance.\nBy engaging the business activities (e.g., sense customer needs, mar-\nket research, R&D) companies can increase the possibility of achieving\nprocess innovation success ( Zollo & Winter, 2002 ). In the IT field some\nauthors focus on the importance of assessing how business processes\ncan bring value to firms (e.g., ( Chen et al., 2014; Tallon, 2007 )). Recent\nconceptual considerations are that BDA is a source of DC (organizational\nagility, speci fically) and that BDA are a way to provide business value to\nfirms ( Erevelles et al., 2016 ). Therefore, the hypothesis is:\nH5. Process-level performance has a positive effect on competitive\nadvantage.\n3.1.3. The mediating role of agility on the relationship between knowledge\nassets and performance\nEarlier IT literature considers that dynamic capabilities can establish\na link between knowledge assets and firm performance ( Sher & Lee,\n2004; Wang, Klein, & Jiang, 2007 ). In the management field some\nauthors examine agility as a mediator between the management of\nknowledge assets and performance ( Chung, 2010; Liu et al., 2014 ).\nAlso, the proposed model suggests a potential mediating role of agility\nin the relationship between knowledge assets and two types ofperformance (process-level performance and competitive advantage).\nThus, additional hypotheses are:\nH6a. Agility positively mediates the relationship between endogenous\nknowledge management and competitive advantage.\nH6b. 
Agility positively mediates the relationship between exogenous\nknowledge management and competitive advantage.\nH6c. Agility positively mediates the relationship between knowledge\nsharing with partners and competitive advantage.\nH7a. Agility positively mediates the relationship between endogenous\nknowledge management and process-level performance.\nH7b. Agility positively mediates the relationship between exogenous\nknowledge management and process-level performance.\nH7c. Agility positively mediates the relationship between knowledge\nsharing with partners and process-level performance.\n3.1.4. Competitive advantage\nCompetitive advantage exists when a firm reveals having greater\nsuccess compared with its current or potential competitors ( Peteraf &\nBarney, 2003 ). To be consistent with this conceptualization, superior\nfirm performance relative to that of competitors constitutes an empiri-\ncal and common indicator of competitive advantage. ( Barnett, Greve, &\nPark, 1994; Schilke, 2014 ). Based on Schilke's construct ( Schilke, 2014 ),\ncompetitive advantage was operationalized as re flective-re flective type\n(Ringle, Sarstedt, & Straub, 2012 ), with the first-order dimensions of:\n(1) strategic performance (qualitative dimension) and (2) financial per-\nformance (quantitative dimension), both in comparison to competition.\n3.1.5. Controls\nAs literature widely supports, this study uses the industry and the\ncountry in which a firm competes as predictors of competitiveness\n(Schilke, 2014 ). BDA may be particularly useful to firms operating in\nturbulent technological environments ( Wade & Hulland, 2004 ), and\nconsequently, following the approach of Menguc and Auh ( Menguc &\nAuh, 2006 ) and Drnevich and Kriauciunas ( Drnevich & Kriauciunas,\n2011 ), the study includes turbulent technological environment as a con-\ntrol. 
A turbulent technological environment makes current technology\nobsolete and requires the development of new advances ( Menguc &\nAuh, 2006 ). Finally, we use the variable “time since adoption of BDA ”\nto control for the knowledge and experience that organizations gain\nby using BDA over time ( Elbashir et al., 2013 ). These controls explain\nall dependent variables (agility, process-level performance, and\ncompetitive advantage).\n4. Research design\n4.1. Measurement\nTo test the model ( Fig. 1 ) and the related hypotheses, the study per-\nforms a multi-country survey of European organizations from several\nindustries. Following the recommendations of Moore and Benbasat\n(Moore & Benbasat, 1991 ), the study uses a survey instrument drawing\nupon a comprehensive literature review. Regarding content validity,\nfive established academic IS researchers and two language experts\nreview each item on the questionnaire, assessing its content, scope,\nand purpose ( Brislin, 1970 ). To test the dif ficulty of the questions, to-\ngether with the reliability and validity of the scales, a pilot study uses\na sample of 30 executives from firms not part of the main survey.\nRemoval of some items reduces ambiguity and simpli fies interpretation.\nThe survey instrument and measurement items are in Appendix A.382 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\n4.2. Data\nThe survey was conducted in 2015 using an online survey tool. To\nguarantee the quality of the data, the respondent pro file uses the\nfollowing three criteria: deep knowledge of the organization strategy,\nmore than five years of experience in BI&A/BDA initiatives, and holding\nan IT/business executive or management position in the company. Themailing database comes from Dun & Bradstreet, one of the world's lead-\ningfirms for commercial information and business insight. 
The initial\nsample of 500 firm executives from European firms receives an email\nto participate in the survey.\nNinety-two valid responses were received in the first month. To\nincrease the response rate a follow-up email was sent. During the\nfollowing months 83 additional valid responses were received from\nlate responders, totaling 175 usable responses (overall response rate\nof 35%). As seen in Table 1 , the sample comprises different industries\nof which almost half are financial firms (40.5%). Regarding firm size,\nthe sample is equally distributed between mid-size and large compa-\nnies. Business (41.4%) and IT executives (58.6%) are well represented.\nNon-response bias was assessed using the sample distributions of the\nearly and late respondent groups compared with the Kolmogorov-\nSmirnov test ( Ryans, 1974 )( s e e Table 2 ). The early respondents were\nidenti fied by selecting the respondents in the first month. The test\nshows that the two groups do not differ statistically (5% signi ficance\nlevel, pN0.05), demonstrating the absence of non-response bias\n(Ryans, 1974 ). Due to the fact that the study collects data simultaneous-\nly from a single source, for the sake of validity, common method bias\nneeds to be assessed. The study uses Harman's post hoc single-factor\nanalysis for this purpose. A factorial analysis of all indicators was con-\nducted and the first extracted factors explain 36.9% of variance. This\nmeans that common method bias is unlikely to be an issue in the data\nPodsakoff et al., 2003 .\n5. Results\nTo estimate the conceptual model, the study uses the partial least\nsquares (PLS) method ( Hair, Ringle, & Sarstedt, 2011 ). PLS ful fills theresearch purpose by examining the validity of the constructs, without\nrequiring normal distributions for the variables. PLS requires a sample\nsize of ten times the number of the largest number of structural paths\ndirected at a particular construct ( Gefen & Straub, 2005 ). 
In the\nconceptual model the largest number of structural paths directed to a\nparticular construct is three, which means that the minimum sample\nsize should be 30. The sample is larger ( n=1 7 5 ) ,m e a n i n gt h a ti ti sa d -\nequate for PLS. Before testing the structural model, the study analyzes\nthe measurement model in order to assess reliability and validity.\n5.1. Measurement model\nThe study examines indicator reliability, construct reliability, con-\nvergent validity, and discriminant validity in order to assess the mea-\nsurement model. Tables 3 and 4 show the results of the measurement\nmodel. Regarding indicator reliability, only loadings above 0.7 were\nconsidered. Hence, four items (ENKM5, DC1, PLP3-4) were eliminated.\nAsTable 3 reveals, the instrument presents good indicator reliability,\nas the loadings are above 0.70. The composite reliability coef fi\ncient as-\nsesses the construct reliability because construct reliability takes into\nconsideration indicators having different loadings ( Hair et al., 2011;\nHenseler, Ringle, & Sinkovics, 2009 ).Table 4 shows that all constructs\nhave composite reliability above 0.7, which suggests that the constructs\nare reliable. To test convergent validity, the study uses average variance\nextracted (AVE). The AVE should be higher than 0.5, (i.e., the latent var-\niable explains more than half of the variance of its indicators ( Henseler\net al., 2009; Fornell & Larcker, 1981 )).Table 4 shows that all constructs\nmeet this criterion. Regarding discriminant validity, the study uses two\nmeasures: the Fornell-Larcker criterion and cross-loadings. First, ac-\ncording to Fornell and Larcker ( Fornell & Larcker, 1981 ), the square\nroot of AVE should be greater than the correlations with other latent\nvariables. Table 4 shows that the square roots of AVEs (in bold) are\nhigher than the correlation between constructs. All the constructs\nshow evidence of acceptable discrimination. 
Second, the loading of\neach indicator should be greater than all cross-loadings ( Chin, 1998a )\n(see Table 3 ). Overall, the model has good indicator reliability, construct\nreliability, convergent validity, and discriminant validity. As these\ncriteria are met, the constructs can test the structural model.\n5.2. Structured model\nTo evaluate the structured model, we followed Hair's five-step\napproach ( Hair et al., 2013 ): (1) collinearity assessment, (2) structural\nmodel path coef ficients, (3) coef ficient of determination (R2value),\n(4) effect size f2,a n d( 5 )p r e d i c t i v er e l e v a n c eQ2and blindfolding.\nRegarding collinearity (1), the results suggest minimal collinearity\namong the constructs (the highest VIF among the explanatory variables\nis 2.95), which means the predictors in the structural model do not\nsuffer from this issue. To empirically assess the hypotheses postulated\ninSection 3 , the study examines the level of signi ficance in pathTable 1\nSample pro file.\nSample characteristics (n = 175) Obs. (%)\nRespondent position\nIT executive\nChief Information Of ficer (CIO) 22 12.5%\nIT Director 26 14.8%IT Manager 32 18.2%Other IT executive 23 13.1%\nBusiness executive\nChief Financial Of ficer (CFO) 19 10.9%\nBusiness Manager - Strategic Planning 18 10.3%\nCentral Operations Of ficer (COO) 14 8.0%\nOther Business executive 21 12.0%\nNo. of employees\nb50 14 8.0%\n50–250 76 43.4%\nN250 85 48.5%\nIndustry\nManufacturing 23 13.1%Electricity, gas and water supply activities 11 6.2%Wholesale and retail trade 19 10.8%Transports and telecommunications 18 10.2%Financial intermediation 71 40.5%Others 33 18.8%\nNotes: (1) The firm size is categorised based on European enterprises size classi fication\n[104]; (2) The industries of activity are in accordance with NACE (European standard clas-sification of productive economic activities).Table 2\nTesting possible response bias: early vs. 
late respondents.\nConstructs Full sample\nN = 175Early\nrespondentsN=9 2Late\nrespondentsN=8 3Kolmogorov-\nSmirnov test\nMean S.D. Mean S.D. Mean S.D. p-Value\nENKM 5.9 0.71 5.9 0.67 5.9 0.75 0.65\nEXKM 5.8 0.86 5.9 0.85 5.7 0.86 0.07\nKSP 4.8 0.89 4.8 0.80 4.7 0.98 0.30AG 6.1 0.93 6.1 0.78 6.0 1.07 0.72PLP 6.1 0.81 6.1 0.78 6.0 0.83 0.23CA 5.9 0.82 6.0 0.72 5.8 0.92 0.34SP 6.0 0.81 6.0 0.72 6.0 0.89 0.76FP 5.9 0.96 6.0 0.81 5.7 1.09 0.16383 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\ncoefficients (2) by means of a bootstrapping technique ( Hair et al.,\n2011; Henseler et al., 2009 ) with 5000 iterations of re-sampling, with\neach bootstrap sample constituted by the number of observations\n(i.e., 175 cases). To have more conservative outcomes, the study uses\ntheno sign change option ( Hair et al., 2013 ).Fig. 2 shows the estimated\nmodel (path coef ficients, R2and Q2), and Table 5 summarizes the\nresults. Concerning R2values (3), all dependent variables present rea-\nsonable values. In addition, this study calculates the f2and q2effect\nsizes (4). Most of the values of f2effect size are small, with the exception\nof agility in process-level-performance and exogenous knowledge\nmanagement in agility (moderate effects). Last, based on a blindfolding\nprocedure, all Q2values are above zero, which means the model has\npredictive power concerning the dependent variables (see Fig. 2 ).\nFig. 2 summarizes the analysis results as follows: the conceptual\nmodel explains 61.8% of the variation in organizational agility. Endoge-\nnous Knowledge Management (EnKM) ( ^β= 0.155; pb0.01) and Exog-\nenous Knowledge Management (ExKM) ( ^β= 0.248; pb0.001) are\nstatistically signi ficant in explaining organizational agility (AG). Thus,\nH1 and H2 are con firmed, whereas knowledge sharing partners (KSP)\n(H3) is not con firmed. 
Organizational agility (AG) ( ^β= 0.371;\npb0.001) is statistically signi ficant in explaining Process-level Perfor-\nmance (PLP), and consequently H4b is supported. The conceptual\nmodel explains 57.8% of the variation in Process-level Performance\n(PLP). Agility (AG) contributes signi ficantly to explain performance attwo levels: Process-level Performance (PLP) ( ^β= 0.371; p b0.001)\nand Competitive Advantage (CA) ( ^β=0 . 2 0 4 ;p b0.01), which con firms\nH4a and H4b .H5is not supported, as the effect is statistically not signif-\nicant (PLP- NCA). The conceptual model explains 77.8% of the variation\nin Competitive Advantage (CA). The conceptual model substantially ex-\nplains the variation of all three dependent variables ( Chin, 1998b;\nHenseler et al., 2009 ).\n5.3. Mediating effect testing\nBased on the guidelines of Hair ( Hair et al., 2013 ), Preacher\n(Preacher & Hayes, 2008 ), and Nitzl ( Nitzl, Roldán, & Cepeda,\n2016 ), the study evaluates the signi ficance of the mediating effects\nof organizational agility. Mediation analysis is eligible if the indirect\neffect is signi ficant. Table 6 presents the results, which ful fill the nec-\nessary conditions to perform the mediator assessment. Also, the\nstudy calculates variance accounted for (VAF) to determine the size\nof the indirect effect in relation to the total effect ( Hair et al.,\n2013 ). The results show that agility can partially mediate the\nrelationship between knowledge assets (endogenous and exogenous\nknowledge) and performance (process-level performance andcompetitive advantage), thereby supporting H6a,b and H7a,b. 
No\nmediating effects were found between knowledge sharing withTable 3\nLoadings and cross-loadings for the measurement model.\nConstruct Item ENKM EXKM KSP AG PLP FP SP\nEndogenous knowledge management ENKM1 0.715 0.171 0.270 0.264 0.240 0.266 0.180\nENKM2 0.796 0.092 0.393 0.184 0.094 0.331 0.190\nENKM3 0.915 0.317 0.294 0.450 0.322 0.476 0.371\nENKM4 0.826 0.313 0.135 0.374 0.331 0.508 0.365\nExogenous knowledge management EXKM1 0.086 0.797 -0.183 0.390 0.365 0.328 0.345\nEXKM2 0.214 0.899 -0.136 0.495 0.477 0.446 0.403\nEXKM3 0.397 0.775 0.057 0.444 0.636 0.515 0.434\nKnowledge sharing partners KSP1 0.383 −0.012 0.873 −0.125 −0.140 −0.167 −0.156\nKSP2 0.324 −0.058 0.939 −0.145 −0.185 −0.116 −0.192\nKSP3 0.210 −0.140 0.960 −0.245 −0.276 −0.199 −0.300\nAgility AG2 0.395 0.453 −0.182 0.860 0.576 0.586 0.729\nAG3 0.397 0.482 −0.189 0.931 0.604 0.619 0.665\nAG4 0.402 0.538 −0.085 0.905 0.608 0.607 0.627\nAG5 0.327 0.494 −0.263 0.928 0.590 0.640 0.682\nPerformance at process level PLP1 0.315 0.629 −0.231 0.676 0.951 0.571 0.563\nPLP2 0.308 0.533 −0.204 0.558 0.939 0.525 0.552\nCompetitive advantage Financial performance FP1 0.445 0.501 −0.238 0.675 0.571 0.950 0.728\nFP2 0.531 0.496 −0.071 0.594 0.487 0.949 0.665\nFP3 0.477 0.518 −0.199 0.657 0.594 0.950 0.704\nStrategic performance SP1 0.343 0.363 −0.134 0.615 0.507 0.584 0.840\nSP2 0.327 0.445 −0.298 0.683 0.499 0.719 0.932\nSP3 0.321 0.485 −0.230 0.715 0.590 0.681 0.927\nThefigures in bold represents the cross-loadings for the measurement model.\nTable 4\nCorrelation matrix, composite reliability (CR), and square root of AVEs.\nCR ENKM EXKM KSP AG PLP FP SP\nEndogenous knowledge management (ENKM) 0.89 0.82\nExogenous knowledge management (EXKM) 0.87 0.30 0.83\nKnowledge Sharing with Partners (KSP) 0.95 0.31 −0.09 0.93\nAgility (AG) 0.95 0.42 0.54 −0.20 0.91\nProcess level performance (PLP) 0.94 0.33 0.62 −0.23 0.66 0.95\nFinancial performance (FP) 0.97 0.51 0.54 −0.18 0.68 0.58 0.95\nStrategic performance (SP) 
0.93 0.37 0.49 −0.25 0.75 0.59 0.74 0.90\n(1) First column are CR (composite reliability).\n(2) Diagonal elements are square root of average variance extracted (AVE).(3) Off-diagonal elements are correlations.The bold figures represent the square roots of AVEs.384 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\npartners and performance (process-level performance and competi-\ntive advantage), which means H6c and H7c are not con firmed.\n6. Discussion\nAs BDA can generate value in several ways, the need exists to under-\nstand the entire chain. This study fills the research gap by assessing not\nonly the antecedents but also the effects of BDA initiatives in European\nfirms.\nThe results strongly support the claim that BDA applications can\nallow an effective internal and external knowledge management which\ncan help firms to create organizational agility. This agility exists in several\nways: (1) by sensing opportunities and threats (e.g., reacting to new\nproducts or services of competitors); (2) by seizing possible chances\n(e.g., expanding into new regional or international markets), and\n(3) by adjusting to the technological environment to attain competitive\nadvantage (e.g., adopting new technologies to produce products andservices more ef ficiently). This finding is consistent with earlier literature\n(Chen et al., 2014; Liu et al., 2014; Sher & Lee, 2004 ).\nRegarding the antecedents, the results demonstrate that BDA can\nsupport organizational knowledge management, allowing the crea-\ntion/enhancement of dynamic capabilities such as organizational agility.\nThisfinding is consistent with earlier studies applied to IT innovations\nand organizational management (e.g., ( Nieves & Haller, 2014; Sher &\nLee, 2004; Cai et al., 2013; Liu et al., 2014; Cepeda & Vera, 2007 )). The\nresults suggest that exogenous knowledge management deserves\nmore attention, which was considered more important than endoge-\nnous knowledge management. 
This outcome suggests that BDA\ntechnologies can provide business value by facilitating the acquisition\nof supply chain and marketing knowledge. While knowledge manage-\nment is important to explain BDA value creation, the way of sharing\nthis strategic asset among business partners is not statistically signi fi-\ncant in this study. Although the hypothesis related to the knowledge\nshared with partners ( H3) seems plausible and consistent with earlier\nstudies for other IT innovations (e.g., ( Zhu & Kraemer, 2005; Zheng\nFig. 2. Estimated model. Note: ns = non-signi ficant. ** |t| N=1.96 at p = 0.05; *** |t| N=2 . 5 7a tp=0 . 0 1l e v e l ;* * * *| t | N=3.29 at p = 0.001 level.\nTable 5\nSignificant testing results of the structural model path coef ficients.\nStructural path Path coef ficient (t-value) Effect size (f2) Effect size (q2) 95% con fidence interval Conclusion\nEndKM →AG 0.155⁎⁎\n(2.562)0.038 0.024 [0.032; 0.268] H1supported\nExKM→AG 0.248 ⁎⁎⁎⁎\n(4.556)0.120 0.074 [0.149; 0.364] H2supported\nKSP→AG 0.010 ns\n(0.121)0.000 0.000 [ −0.145; 0.169] H3not supported\nAG→CA 0.204 ⁎⁎⁎\n(2.786)0.064 0.021 [0.065; 0.351] H4a supported\nAG→PLP 0.371⁎⁎⁎⁎\n(3.969)0.125 0.080 [0.173; 0.544] H4b supported\nPLP→CA 0.106 ns\n(1.579)0.021 0.007 [ −0.030; 0.234] H5not supported\nNote: ns = non-signi ficant.\nThe values of f2and q2effects can be considered weak (0.02). moderate (0.15) and strong (0.35).\nConfidence level:\n⁎⁎|t|N=1.96 at p= 0.05 level.\n⁎⁎⁎|t|N=2 . 5 7a t p=0 . 0 1l e v e l .\n⁎⁎⁎⁎ |t|N=3.29 at p= 0.001 level.385 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\net al., 2011; Ruivo, Oliveira, & Neto, 2014 )), this construct does not con-\ntribute to creating valuable organizational agility. An earlier study con-\ncludes that using this type of knowledge is not always useful and can\nharm speci fic business processes in some situations. 
Moreover, this\nstudy shows that agility can partially mediate the positive effect of\nsome knowledge assets (exogenous and endogenous) on performance\n(process-level performance and competitive advantage) (H6a, H6b and\nH7a, H7b). This finding is consistent with earlier studies (Liu et al., 2013;\nLiu et al., 2014; Pavlou & El Sawy, 2006).\nCompetitive performance is not only about how much firms know,\nbut how they use what they know (Haas & Hansen, 2005). A possible\nexplanation for this result is that firms are reluctant to share sensitive\ninformation that might compromise their competitive advantage. In\nfact, synergies with business partners can be beneficial (e.g., (Setia,\nRichardson, & Smith, 2015)), but careful attention is needed regarding\nthe shared information. The study shows that knowledge sharing with\npartners can be truly compromising in the areas of Production and Operations or Product and Service enhancement, which represent the core\nbusiness practices of a firm. An information sharing agreement might be\na solution to overcome this constraint.\nConcerning the effects of agility leveraged by BDA, the results indicate that this dynamic capability can positively impact competitive advantage in different ways (via processes or organizationally), which is\nin line with the findings of other authors (Drnevich & Kriauciunas,\n2011; Protogerou et al., 2012) (H4a,b). Agility can also be more effective\nin improving specific business processes than organizational performance, which is consistent with Drnevich and Kriauciunas (Drnevich\n& Kriauciunas, 2011). The results demonstrate that no significant link\nexists between process-level performance and competitive advantage\n(H5). In this sense, Drnevich and Kriauciunas (Drnevich & Kriauciunas,\n2011) argue that a firm's performance depends on a set of elements\nthat might fail due to miscommunication between the business areas\nand the top management. 
Although some business areas can behave\nin an efficient way, this efficiency does not necessarily have a significant\neffect on the overall performance.\nAlthough BDA technologies are generally associated with customer\nmanagement or marketing areas, results indicate that, in general,\nEuropean firms focus more on internally improving their assets\n(products and services) and the way that these are being produced to\noptimize costs. With Europe still showing signs of financial crisis, this\nfinding might point the way to a change of survival strategy in competitive markets.\n6.1. Limitations and further research\nCertain limitations apply to the interpretation of the results of this\nstudy. First, the antecedents of agility do not extend beyond the specific\nknowledge resources included in the model. Other factors can also\ndetermine the development of this dynamic capability in European\nfirms. Future studies may include these resources as variables of the model or by moderating existing variables. Second, although the study\nconsiders constructs in the model embedding the impact of BDA at\nprocess-level, the model is firm-level. Before generalization is possible,\nresearchers should perform a longitudinal study based on the process\napproach. Future research should use specific process constructs to\nassess the impact of BDA on several business areas in detail. 
Third, due\nto the perceptual nature of the measures used, future studies should\nidentify the issues associated with cross-sectional research design.\nAlthough the use of objective measures to assess firm performance is\nimportant, in this study companies were reluctant to provide them.\nFourth, although the sample size is statistically adequate, a larger\nsample could be useful to reinforce the conclusions of this study.\nResearchers generally accept that BDA can provide benefits to all\nEuropean firms (European_Commission, 2015) across several industries, a view reinforced by a McKinsey survey (Manyika et al., 2011b), which reports\nthat most industries in Europe have the capacity to store and manipulate big data, and consequently the potential value of using big data\nresides mainly in developed countries. Therefore, data from five\nEuropean developed countries were collected. By conducting future\nstudies in more countries and industries, which may have different perceptions of BDA and diverse external contexts, the understanding of\nBDA business value could likely improve. Due to their different cultures,\nresearch to perform a comparative study among European regions\n(e.g., Northern and Southern Europe) could be interesting.\n6.2. Theoretical implications\nThis study offers two key contributions that extend theory on BDA in\ntechnology and organizational management research:\n(1) BDA value chain understanding - Despite the potential benefits,\nsome firms fail to capture value from BDA initiatives (Kaisler\net al., 2013). Recent papers focus on BDA research opportunities\n(Abbasi et al., 2016; Agarwal & Dhar, 2014), claiming that there is\na need to conduct assessments of the actual impact of BDA\ninvestments and use, and to understand how to achieve the\nbenefits for performance. The BDA value chain remains relatively\nunexplored and requires further investigation. 
The current paper\nresponds to the calls of scholars by empirically assessing the\nvalue that BDA can bring to European firms. This study theoreti-\ncally proposes and empirically validates a conceptual model\nbased on strategic management theories (KBV and DC), never\nbefore combined for this purpose, to explain the full BDA value\nchain. Liu ( L i ue ta l . ,2 0 1 4 ) argues that literature about the\nrelationship among knowledge management, organizational\nagility, and firm performance is still limited. This is the first\nstudy that empirically demonstrates that BDA applications\nbased on an effective knowledge management can help firms to\ncreate organizational agility leading to competitive advantage.\nFurther studies could bene ficially use this theoretical framework\nto assess the business value in other IT innovations at a process-Table 6\nMediation test by bootstrapping approach.\nEffect of Direct effect (t-value) Indirect effect (t-value) Total effect VAF (%) Interpretation Conclusion\nEnKM→AG→CA 0.137 ⁎⁎(2.317) 0.053 ⁎⁎(2.156) 0.190 ⁎⁎⁎⁎(3.577) 27.89% Partial mediation H6a supported\nExKM→AG→CA 0.081 ns (1.506) 0.097 ⁎⁎⁎(2.617) 0.178 ⁎⁎⁎⁎(4.037) 54.49% Partial mediation H6b supported\nKSP→AG→CA 0.026 ns (0.464) −0.014 ns (0.607) 0.012 ns (0.199) na No mediation H6cnot supported\nEnKM→AG→PLP 0.141 ⁎⁎(1.988) 0.057 ⁎⁎(2.212) 0.198 ⁎⁎⁎(2.813) 28.79% Partial mediation H7a supported\nExKM→AG→PLP 0.344⁎⁎⁎⁎(5.412) 0.092⁎⁎⁎(3.041) 0.436⁎⁎⁎(7.219) 21.10% Partial mediation H7b supported\nKSP→AG→PLP −0.157⁎⁎(2.408) 0.003 ns (0.119) −0.154⁎⁎(2.172) na No mediation H7cnot supported\nNote: VAF = variance accounted for. The VAF N80% indicates full mediation. 20% ≤VAF≥80% show partial mediation. VAF b20% indicates no mediation. ns = non-signi ficant. na = not\napplicable.\n⁎⁎|t|N=1.96 at p = 0.05 level.\n⁎⁎⁎|t|N= 2.57 at p = 0.01 level.\n⁎⁎⁎⁎ |t|N=3.29 at p = 0.001 level.386 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nlevel and firm-level. 
Academics can make use of this paper for\npedagogical support for teaching about BDA value chain.\n(2)DC literature –This paper contributes to DC research by empir-\nically testing agility business value in a BDA context ( Drnevich\n& Kriauciunas, 2011 ). The results strongly support the belief\nthat BDA technologies can trigger agility and that agility can af-\nfect competiveness in two ways (via processes or globally). AsBDA can signi ficantly improve business processes ( Davenport,\n2006 ), business process enhancement driven by BDA is an im-\nportant research area ( Abbasi et al., 2016 ). Earlier studies focus\nonly on the link between agility and firm performance ( Chen\net al., 2014; Liu et al., 2014; Tallon & Pinsonneault, 2011 ), while\nthis study empirically demonstrates that an effect of agility exists\nat the process-level, too. In addition, despite an increasing use of\nmediation testing, most of the studies in PLS-SEM do not analyze\nmediation effects ( Hair et al., 2013; Nitzl et al., 2016 ). Under-\nstanding mediation issues can be crucial for researchers because\nthey can better explain or hinder the in fluence of a third variable\nin the relationship between two variables in a model ( Cepeda &\nVera, 2007 ). This study demonstrates that agility can be a\nmediator between external and internal knowledge assets and\nperformance (process-level performance and competitive\nadvantage).\n6.3. Managerial implications\nFor practitioners (including executives and IT managers) this study\ndemonstrates how best to leverage the knowledge embedded in BDA\nsystems and initiatives and achieve capabilities that will help to main-\ntain competitive advantages. The paper provides support to justify\nBDA investments and initiatives. The results indicate that although\nBDA technologies call for substantial investment in implementation\nand maintenance, European firms are aware of BDA's potential value\nand bene fits. 
Executives should apply these guidelines to their organiza-\ntional IT strategy.\nBDA can provide value at several stages: (1) knowledge; (2) dynamic\ncapability (organizational agility); (3) business process; and (4) com-\npetitive performance. To initiate the value creation process, firms\nshould invest in an effective BDA program. First, the value that BDA\ncan provide derives first from the way firms use the technologies\navailable to manage knowledge. An effective training program can\nhelp to leverage the way users extract and manage knowledge. Second,\nby effectively using BDA, firms can acquire capabilities to innovate and\nrapidly adjust to external demands (e.g., optimize business processes).\nThird, these capabilities will encourage speci fic business areas to\ninvolve the whole organization, when an effective bottom-up strategy\nis followed, supported by good communication practices. By applying\nthis framework to BDA speci fically, managers and IT executives can\nbenefit from a performance metric that uniquely speci fies the impact\nof BDA. By evaluating the organizational knowledge conversion into\nprocess and firm-level capabilities, practitioners can increase their\nproductivity. Software vendors of BDA can also gain a better under-\nstanding of how European firms can invest and experience the value\ncreated through BDA. They can natively embed BDA capabilities in\ntheir solutions as a way for their customers to achieve superior financial\nand strategic performance. Finally, firms that have not yet decided to\nadopt these technologies can gain a perception of what is possible by\nadopting and effectively using BDA.\n6.4. Business research implications\nThe business community now sees big data as a potential tool of\nbusiness value for achieving competitive advantage. This value can\nonly be real if companies know how to effectively manage Big Data An-\nalytics (BDA) initiatives. 
This paper establishes a first link between BDAprocess-level performance and competitive advantage, by merging the\nfield of information systems and strategic management. By presenting\nand discussing strategic and organizational drivers and impacts of\nBDA, guidance to business researchers, practitioners, and scholars is\nprovided. As such, this paper extends knowledge by directly evaluating\nthe effect of BDA on the decision-making process to support an effective\nIT resource management, focusing on challenges for adoption, gover-nance, and evaluation.\nThe outcomes of this paper indicate that BDA can be an effective\naid to survival in competitive markets, particularly by supporting\nProduction and Operations or P roduct and Service enhancement.\nStriving to overcome damages of the financial crisis, European\nfirms are using BDA tools to internally improve their assets (products\nand services) and the way that these are being produced to optimize\ncosts. European firms tend to attribute greater value to external\nknowledge provided by BDA applications than to internal knowledge\nmanagement. Sharing knowledge with business partners is poten-\ntially harmful to organizational productivity, so careful attention is\nin order when exchanging this type of core data between companies.\nAlso, this study concludes that organizational agility leads directly to\na better performance (process-level and competitive advantage) but\ncan mediate effects from knowledge assets on performance. This\nmeans that fir m sm u s tb e a ri nm i n dt h a ts e v e r a lp a t h sc a nl e a dt o\ncompetitive advantage. First, managers should consider investing\nin BDA technologies to take advantage of internal and external\nknowledge resources. Second, by governing the knowledge extract-\ned by BDA, agility becomes the “ultimate ”organizational capability\nthat leads to sustainable compet itive advantages. 
Firms should\nconfidently invest in the development of agility supported by BDA\ntools.\n7. Conclusions\nAs Big Data Analytics (BDA) can offer value to companies in\nseveral ways, many scholars highlight the need to understand the\npath to competitive advantage. The main outcome emerging from\nthis paper has to do with understanding the value chain of BDA.\nGrounded on knowledge-based view (KBV) and dynamic capabilities\n(DC), this study fills a research gap from the strategic management\nperspective, by perceiving the antecedents (knowledge assets) and\nthe impacts (on process-level performance and competitive advan-\ntage) of BDA initiatives in European firms. The results show that\nthe model signi ficantly explains all dependent variables (61.8% of\nagility variation, 57.8% of process-level performance variation, and\n77.8% of competitive advantage variation). The major conclusions\nof this study are:\na) BDA can be a strategic investment for European firms to enhance or-\nganizational agility and survive in competitive markets. Firms\nshould invest in the development of organizational agility supported\nby effective BDA applications.\nb) To create agility, European firms tend to believe that the external\nknowledge deriving from BDA applications can be more effective\nin the creation of agility than internal knowledge. Sharing knowl-\nedge with business partners is problematic, as sharing, is a potential\nbarrier for process-level performance.\nc) Regarding the impacts of agility, this capability leads directly to a\nbetter performance (process-level and competitive advantage) but\ncan mediate effects from knowledge assets on performance. 
This\nmeans that BDA initiatives can lead to better operational ef ficiency,\nbut several paths can lead to competitive advantage.\nThus, a crucial need exists for firms to have an integrated view of the\nBDA chain in order to be able to fully leverage the innovative power of\nBDA capabilities to achieve competitive advantage.387 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nAppendix A. Survey questionnaire\nReferences\nAbbasi, A., Sarker, S., & Chiang, R. H. (2016). Big Data research in information systems: To-\nward an inclusive research agenda. Journal of the Association for Information Systems ,\n17(2), 3.\nAgarwal, R., & Dhar, V. (2014). Editorial —Big Data, data science, and analytics: The oppor-\ntunity and challenge for IS research. Information Systems Research ,25(3), 443 –448.\nAmbrosini, V., & Bowman, C. (2009). What are dynamic capabilities and are they a useful\nconstruct in strategic management? International Journal of Management Reviews ,\n11(1), 29 –49.\nArend, R., & Bromiley, P. (2009). Assessing the dynamic capabilities view: spare change,\neveryone? Strategic Organization ,7(1), 75.\nBarnett, W. P., Greve, H. R., & Park, D. Y. (1994). An evolutionary model of organizational\nperformance. Strategic Management Journal ,15(S1), 11 –28.\nBarton, D. (2012). Making advanced analytics work for you. Harvard Business Review ,90,\n78–83.\nBarua, A., Kriebel, C. H., & Mukhopadhyay, T. (1995). Information technologies and busi-\nness value: An analytic and empirical investigation. Information Systems Research ,\n6(1), 3 –23.\nBharadwaj, A. S. (2000). A resource-based perspective on information technology\ncapability and firm performance: An empirical investigation. MIS Quarterly ,24(1),\n169–196.Blome, C., Schoenherr, T., & Rexhausen, D. (2013). 
Antecedents and enablers of supply\nchain agility and its effect on performance: A dynamic capabilities perspective.\nInternational Journal of Production Research ,51(4), 1295 –1318.\nBrislin, R. W. (1970). Back-translation for cross-cultural research. Journal of Cross-Cultural\nPsychology ,1(3), 185 –216.\nCai, Z., et al. (2013). Developing organizational agility through IT capability and KM\ncapability. The moderating effects of organizational climate .P A C I S .\nCepeda, G., & Vera, D. (2007). Dynamic capabilities and operational capabilities: A\nknowledge management perspective. Journal of Business Research ,60(5),\n426–437.\nChau, M., & Xu, J. (2012). Business intelligence in blogs: Understanding consumer interac-\ntions and communities. MIS Quarterly ,36(4), 1189 –1216.\nChen, H., Chiang, R., & Storey, V. (2012). Business intelligence and analytics: From Big\nData to big impact. MIS Quarterly ,36(4), 1165 –1188.\nChen, Y., et al. (2014). IT capability and organizational performance: The roles of business\nprocess agility and environmental factors. European Journal of Information Systems ,\n23(3), 326 –342.\nChin, W. W. (1998a). Commentary: Issues and opinion on structural equation modeling.\nJSTOR, 7 –16.\nChin, W. W. (1998b). The partial least squares approach for structural equation modeling.\nChung, T. R. (2010). Knowledge creation and firm performance. In e. (Ed.), Mediating\nprocesses from an organizational agility perspective .A M C I S .Constructs Items Source\nKnowledge assets Please indicate the extent to which these forms of knowledge are used in your organization.\nBDA technologies:\nEndogenous knowledge\nManagementENKM1. Reduce uncertainties of knowledge loss\nENKM2. Reduce dependence on speci fic personnel\nENKM3. Are comprehensively utilized by members in organizationENKM4. Are comprehensively constructed in organization*(Sher & Lee, 2004 )\nExogenous knowledge\nManagementEXKM1. Facilitate acquisition of supply chain knowledge\nEXKM2. 
Facilitate processing of supply chain knowledge\nEXKM3. Facilitate processing of marketing knowledge(Sher & Lee, 2004 )\nKnowledge sharing with channel\npartnersKSP1. We frequently share knowledge about our business environment\n(e.g., other business relationships) with our channel partners.KSP2. Knowledge about all of our channel partners, competitors, etc., is shared with ourother channel partners.KSP3. Business insights are exchanged between us and our other channel partners.(Liu et al., 2014 )\nOrganizational agility (dynamic\ncapability)Please indicate the degree to which the use of BDA tools in the last three years has helped to:\nAG1. Respond to changes in aggregate consumer demand.*\nAG2. React to new product or service launches by competitors.\nAG3. Expand into new regional or international markets.AG4. Change (i.e., expand or reduce) the variety of products/services available for sale.AG5. Adopt new technologies to produce better, faster, and cheaper products and services.(Lu & Ramamurthy, 2011 )\nProcess-level performance To what extent has BDA been used to support critical business activities in each of the following\nprocesses in the last three years. A sampling of critical activities in each process is shown below.PLP1. Production and operations: improve throughout, boost labour productivity, improve flexibility\nand equipment utilisation, and streamline operations.\nPLP2. Product and service enhancement: embed IT in products, increase pace of development/R&D,\nmonitor design cost, improve quality, support innovation.PLP3. Marketing and sales: spot market trends, anticipate customer needs, build market share,improve forecast accuracy, and evaluate pricing options.*PLP4. 
Customer relations: respond to customer needs, provide after-sales service and support, improvedistribution, create customer loyalty*(Peteraf & Barney, 2003 )\nCompetitive advantage Please indicate the degree to which you agree with the following statements.\nStrategic Performance\nSP1. We have gained strategic advantages over our competitorsSP2. We have a large market share.SP3. Overall, we are more successful than our major competitors.Financial performanceFP1. Our EBIT (earnings before interest and taxes) is continuously above industry average.FP2. Our ROI (return on investment) is continuously above industry average.FP3. Our ROS (return on sales) is continuously above industry average.(Schilke, 2014 )\nControl variables\nTime since BDA adoption Number of years since adoption (#)Country CountryIndustry Type of industryTechnological turbulence Please indicate the degree to which you agree with the following statements.\nTT1. Extent of technological turbulence in the environment.TT2. Leadership in product/process innovation.\nTT3. Impact of new technology on operations.(Brislin, 1970 )\nNotes: (1) * items eliminated due low loading. (2) Items were measured using a 7-point numerical scale (1 is Strongly Disagree and 7 is Strongly Agree).388 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nCorte Real, N., Oliveira, T., & Ruivo, P. (2014). Understanding the hidden value of business\nintelligence and analytics (BI&A). Twentieth American Conference of Information\nSystems . Savannah, Georgia: Association of Information Systems.\nDavenport, T. H. (2006). Competing on analytics. Harvard Business Review ,84,1–12.\nDella Corte, V., & Del Gaudio, G. (2012). Dynamic capabilities: A still unexplored issue\nwith growing complexity. Corporate Ownership and Control ,9,3 2 7 –338.\nDrnevich, P. L., & Kriauciunas, A. P. (2011). Clarifying the conditions and limits of the con-\ntributions of ordinary and dynamic capabilities to relative firm performance. 
Strategic\nManagement Journal ,32(3), 254 –279.\nElbashir, M. Z., et al. (2013). Enhancing the business value of business intelligence: The role\nof shared knowledge and assimilation. Journal of Information Systems ,27(2), 87 –105.\nErevelles, S., Fukawa, N., & Swayne, L. (2016). Big Data consumer analytics and the trans-\nformation of marketing. Journal of Business Research ,69(2), 897 –904.\nErickson, S., & Rothberg, H. (2015). Big Data and knowledge management: Establishing a\nconceptual foundation. Leading issues in knowledge management. Vol. Two . (pp. 204) 2.\nEuropean_Commission (2015). Towards a thriving data-driven economy. Accessed on:\n30th December 2015]; Available from http://ec.europa.eu/digital-agenda/en/\ntowards-thriving-data-driven-economy#Article\nFornell, C., & Larcker, D. F. (1981). Evaluating structural equation models with unobserv-\nable variables and measurement error. Journal of Marketing Research ,18,3 7 5 –381.\nGefen, D., & Straub, D. (2005). A practical guide to factorial validity using PLS-Graph: Tu-\ntorial and annotated example. Communications of the Association for Information\nSystems ,16(1), 5.\nGoldman, S. L., Nagel, R. N., & Preiss, K. (1995). Agile competitors and virtual organizations:\nStrategies for enriching the customer. Van Nostrand Reinhold.\nGrant, R. M. (1996). Prospering in dynamically-competitive environments: Organization-\nal capability as knowledge integration. Organization Science ,7(4), 375 –387.\nHaas, M. R., & Hansen, M. T. (2005). When using knowledge can hurt performance: The\nvalue of organizational capabilities in a management consulting company. Strategic\nManagement Journal ,26(1), 1 –24.\nHair, J. F., Ringle, C. M., & Sarstedt, M. (2011). PLS-SEM: Indeed a silver bullet. Journal of\nMarketing Theory and Practice ,19(2), 139 –152.\nHair, J. F., Jr., et al. (2013). A primer on partial least squares structural equation modeling\n(PLS-SEM). Sage Publications.\nHelfat, C., & Peteraf, M. (2009). 
Understanding dynamic capabilities: Progress along a de-\nvelopmental path. Strategic Organization ,7(1), 91.\nHelfat, C. E., et al. (2009). Dynamic capabilities: Understanding strategic change in organiza-\ntions. John Wiley & Sons.\nHenseler, J., Ringle, C. M., & Sinkovics, R. R. (2009). The use of partial least squares path\nmodeling in international marketing. Advances in International Marketing (AIM) ,20,\n277–320.\nIDC (2011). Big Data analytics. Future architectures, skills and roadmaps for the CIO .\nKaisler, S., et al. (2013). Big Data: Issues and challenges moving forward. In system sci-\nences (HICSS). 2013 46th Hawaii International Conference on System Sciences .I E E E .\nKwon, O., Lee, N., & Shin, B. (2014). Data quality management, data usage experience and\nacquisition intention of Big Data analytics. International Journal of Information\nManagement ,34(3), 387 –394.\nLaValle, S., et al. (2011). Big Data, analytics and the path from insights to value. MIT Sloan\nManagement Review ,52(2), 21 –31.\nLiu, H., Song, D., & Cai, Z. (2014). Knowledge management capability and firm performance:\nThe mediating role of organizational agility. PACIS.\nLiu, H., et al. (2013). The impact of IT capabilities on firm performance: The mediating\nroles of absorptive capacity and supply chain agility. Decision Support Systems ,\n54(3), 1452 –1462.\nLorenzoni, G., & Lipparini, A. (1999). The leveraging of inter firm relationships as a distinc-\ntive organizational capability: A longitudinal study. Strategic Management Journal ,\n20(4), 317 –338.\nLu, Y., & Ramamurthy, K. (2011). Understanding the link between information technology\ncapability and organizational agility: An empirical examination. MIS Quarterly ,35(4),\n931–954.\nMalladi, S. (2013). Adoption of business intelligence & analytics in organizations –An em-\npirical study of antecedents. 19th American Conference on Information Systems\n(AMCIS) Chicago, Illinois.\nManyika, J., et al. (2011a). In M.G. 
Institute (Ed.), Big Data: The next frontier for innovation,\ncompetition and productivity .M c K i n s e yG l o b a lI n s t i t u t e .\nManyika, J., et al. (2011b). Big Data: The next frontier for innovation competition and\nproductivity. McKinsey Global Institute.\nMata, F. J., Fuerst, W. L., & Barney, J. B. (1995). Information technology and sustained\ncompetitive advantage: A resource-based analysis. MIS Quarterly ,19(4), 487 –505.\nMelville, N., Kraemer, K., & Gurbaxani, V. (2004). Information technology and organiza-\ntional performance: An integrative model of IT business value. MIS Quarterly ,28(2),\n283–322.\nMenguc, B., & Auh, S. (2006). Creating a firm-level dynamic capability through capitaliz-\ning on market orientation and innovativeness. Journal of the Academy of Marketing\nScience ,34(1), 63 –73.\nMoore, G. C., & Benbasat, I. (1991). Development of an instrument to measure the percep-\ntions of adopting an information technology innovation. Information Systems\nResearch ,2(3), 192 –222.\nMorabito, V. (2015). Big Data and analytics: Strategic and organizational impacts. Springer.\nNieves, J., & Haller, S. (2014). Building dynamic capabilities through knowledge resources.\nTourism Management ,40,2 2 4 –232.\nNitzl, C., Roldán, J. L., & Cepeda, G. (2016). Mediation analyses in partial least squares\nstructural equation modeling. Helping researchers discuss more sophisticated models\n(pp. 3 –21).\nNonaka, I. (1995). The knowledge-creating company: How Japanese companies create the\ndynamics of innovation. Oxford University Press.Pavlou, P. A., & El Sawy, O. A. (2006). From IT leveraging competence to competitive ad-\nvantage in turbulent environments: The case of new product development.\nInformation Systems Research ,17(3), 198 –227.\nPavlou, P. A., & El Sawy, O. A. (2011). Understanding the elusive black box of dynamic ca-\npabilities. Decision Sciences ,42(1), 239 –273.\nPavlou, P. A., et al. (2005). 
Measuring the return on information technology: A\nknowledge-based approach for revenue allocation at the process and firm level.\nJournal of the Association for Information Systems ,6(7), 199 –226.\nPeteraf, M. A., & Barney, J. B. (2003). Unraveling the resource-based tangle. Managerial\nand Decision Economics ,24(4), 309 –323.\nPettigrew, A. M., Thomas, H., & Whittington, R. (2001). Handbook of strategy and manage-\nment. Sage.\nPodsakoff, P. M., et al. (2003). Common method biases in behavioral research: A critical\nreview of the literature and recommended remedies. Journal of Applied Psychology ,\n88(5), 879.\nPopovi č, A., et al. (2012). Towards business intelligence systems success: Effects of\nmaturity and culture on analytical decision making. Decision Support Systems ,54,\n729–739.\nPreacher, K. J., & Hayes, A. F. (2008). Asymptotic and resampling strategies for assessing\nand comparing indirect effects in multiple mediator models. Behavior Research\nMethods ,40(3), 879 –891.\nProtogerou, A., Caloghirou, Y., & Lioukas, S. (2012). Dynamic capabilities and their indirect\nimpact on firm performance. Industrial and Corporate Change ,21(3), 615 –647.\nRajpathak, T., & Narsingpurkar, A. (2013). Managing knowledge from Big Data analytics in\nproduct development. Tata Consulting, 11.\nRingle, C. M., Sarstedt, M., & Straub, D. (2012). A critical look at the use of PLS-SEM in MIS\nquarterly. MIS Quarterly (MISQ) ,3 6 ( 1 ) .\nRuggles, R. (1998). The state of the notion: Knowledge management in practice. California\nManagement Review ,40(3), 80 –89.\nR u i v o ,P . ,O l i v e i r a ,T . ,&N e t o ,M .( 2 0 1 4 ) . Examine ERP post-implementation stages of use\nand value: Empirical evidence from Portuguese SMEs. International Journal of\nAccounting Information Systems ,15(2), 166 –184.\nRuivo, P., Oliveira, T., & Neto, M. (2015). Using resource-based view theory to assess the\nvalue of ERP commercial-packages in SMEs. Computers in Industry ,73,1 0 5 –116.\nRussom, P. 
(2011). Big Data analytics. Fourth Quarter: TDWI Best Practices Report.\nRyans, A. B. (1974). Estimating consumer preferences for a new durable brand in an\nestablished product class. Journal of Marketing Research ,4 3 4 –443.\nSambamurthy, V., Bharadwaj, A., & Grover, V. (2003). Shaping agility through digital op-\ntions: Reconceptualizing the role of information technology in contemporary firms.\nMIS Quarterly ,2 3 7 –263.\nSambamurthy, V., et al. (2007). IT-enabled organizational agility and firms' sustainable\ncompetitive advantage. ICIS 2007 proceedings (pp. 91).\nSaraf, N., Langdon, C. S., & Gosain, S. (2007). IS application capabilities and relational value\nin inter firm partnerships. Information Systems Research ,18(3), 320 –339.\nSAS (2013). Big Data analytics. An assessment of demand for labour and skills, 2012 –2017 .\nSchilke, O. (2014). On the contingent value of dynamic capabilities for competitive advan-\ntage: The nonlinear moderating effect of environmental dynamism. Strategic\nManagement Journal ,35(2), 179 –203.\nSchryen, G. (2013). Revisiting IS business value research: What we already know, what\nwe still need to know, and how we can get there. European Journal of Information Sys-\ntems,22(2), 139 –169.\nSetia, P., Richardson, V., & Smith, R. J. (2015). Business value of partner's IT intensity:\nValue co-creation and appropriation between customers and suppliers. Electronic\nMarkets ,1–16.\nShanks, G., & Bekmamedova, N. (2013). Creating value with business analytics in the sup-\nply chain. European Conference of Information Systems. Utrecht: European Conference\non Information Systems .\nShanks, G., & Sharma, R. (2011). Creating value from business analytics systems: The im-\npact of strategy. 15th Paci fic Asia Conference on Information Systems: Quality Research\nin Paci fic, PACIS 2011 (pp. 1 –12). Queensland: Queensland University of Technology.\nSharma, R., Mithas, S., & Kankanhalli, A. (2014). 
Transforming decision-making processes:\nA research agenda for understanding the impact of business analytics on organisa-\ntions. European Journal of Information Systems ,23(4), 433 –441.\nS h e r ,P .J . ,&L e e ,V .C .( 2 0 0 4 ) . Information technology as a facilitator for enhancing\ndynamic capabilities through knowledge management. Information & Management ,\n41(8), 933 –945.\nSoh, C., & Markus, M. L. (1995). How IT creates business value: A process theory synthesis.\nInternational Conference of Information Systems . ICIS Proceedings.\nTallon, P. P. (2007). A process-oriented perspective on the alignment of information\ntechnology and business strategy. Journal of Management Information Systems ,\n24(3), 227 –268.\nTallon, P. P., & Pinsonneault, A. (2011). Competing perspectives on the link between stra-\ntegic information technology alignment and organizational agility: Insights from a\nmediation model. MIS Quarterly , 35(2).\nTeece, D. J. (2007). Explicating dynamic capabilities: The nature and microfoundations of\n(sustainable) enterprise performance. Strategic Management Journal ,28(13), 1319 –1350.\nTeece, D., Peteraf, M. A., & Leih, S. (2016). Dynamic capabilities and organizational agility:\nRisk, uncertainty and entrepreneurial management in the innovation economy. Un-\ncertainty and Entrepreneurial Management in the Innovation Economy (April 7, 2016) .\nTeece, D. J., Pisano, G., & Shuen, A. (1997). Dynamic capabilities and strategic manage-\nment. Strategic Management Journal ,18(7), 509 –533.\nVolberda, H. W. (1996). Toward the flexible form: How to remain vital in hypercompet-\nitive environments. Organization Science ,7(4), 359 –374.\nWade, M., & Hulland, J. (2004). Review: The resource-based view and information sys-\ntems research: Review, extension, and suggestions for future research. MIS\nQuarterly ,28(1), 107 –142.389 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390\nWang, C. L., & Ahmed, P. K. (2007). 
Dynamic capabilities: A review and research agenda.\nInternational Journal of Management Reviews ,9(1), 31 –51.\nWang, E., Klein, G., & Jiang, J. J. (2007). IT support in manufacturing firms for a knowledge\nmanagement dynamic capability link to performance. International Journal of\nProduction Research ,45(11), 2419 –2434.\nWeill, P., Subramani, M., & Broadbent, M. (2002). Building IT infrastructure for strategic\nagility. MIT Sloan Management Review ,44(1), 57.\nWu, L. -Y. (2006). Resources, dynamic capabilities and performance in a dynamic envi-\nronment: Perceptions in Taiwanese IT enterprises. Information & Management ,\n43(4), 447 –454.\nXu, Z., Frankwick, G. L., & Ramirez, E. (2016). Effects of big data analytics and traditional\nmarketing analytics on new product success: A knowledge fusion perspective.\nJournal of Business Research ,69(5), 1562 –1566.Zheng, S., Zhang, W., & Du, J. (2011). Knowledge-based dynamic capabilities and innova-\ntion in networked environments. Journal of Knowledge Management ,15(6),\n1035 –1051.\nZhou, K. Z., & Wu, F. (2010). Technological capability, strategic flexibility, and product in-\nnovation. Strategic Management Journal ,31(5), 547 –561.\nZhu, K., & Kraemer, K. (2005). Post-adoption variations in usage and value of e-business\nby organizations: Cross-country evidence from the retail industry. Information\nSystems Research ,16(1), 61 –84.\nZollo, M., & Winter, S. G. (2002). Deliberate learning and the evolution of dynamic\ncapabilities. Organization Science ,13(3), 339 –351.\nZott, C. (2003). Dynamic capabilities and the emergence of intraindustry differential firm\nperformance: Insights from a simulation study. Strategic Management Journal ,24(2),\n97–125.390 N. Côrte-Real et al. / Journal of Business Research 70 (2017) 379 –390",
"metadata": {
"filename": "Assessing business value of Big Data 2017.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\Assessing business value of Big Data 2017.pdf",
- "file_size": 620244,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:35.562371",
- "content_length": 76355
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\Assessing business value of Big Data 2017.pdf",
+ "size": 620244,
+ "source": "docs_to_import"
+ },
+ "id": "cb2913fe-57a1-489a-8966-be97b8b4a2c0"
},
- "fc9e80fa-1776-4165-b47f-a112395ab0c0": {
- "id": "fc9e80fa-1776-4165-b47f-a112395ab0c0",
- "content": "[Página 1]\nAnnals of Operations Research (2023) 328:1073–1103\nhttps://doi.org/10.1007/s10479-022-04955-2\nORIGINAL RESEARCH\nBig data analytics and the effects of government restrictions\nand prohibitions in the COVID-19 pandemic on emergency\ndepartment sustainable operations\nGörkem Sariyer1·Mustafa Gokalp Ataman2·Sachin Kumar Mangla3·\nYigit Kazancoglu4·Manoj Dora5\nAccepted: 29 August 2022 / Published online: 15 September 2022\n© The Author(s), under exclusive licence to Springer Science+Business Media, LLC, part of Springer Nature 2022\nAbstract\nGrounded in dynamic capabilities, this study mainly aims to model emergency departments’(EDs) sustainable operations in the current situation caused by the COVID-19 pandemic byusing emerging big data analytics (BDA) technologies. Since government may impose somerestrictions and prohibitions in coping with emergencies to protect the functioning of EDs,it also aims to investigate how such policies affect ED operations. The proposed model isdesigned by collecting big data from multiple sources and implementing BDA to transformit into action for providing efficient responses to emergencies. The model is validated inmodeling the daily number of patients, the average daily length of stay (LOS), and dailynumbers of laboratory tests and radiologic imaging tests ordered. It is applied in a case studyrepresenting a large-scale ED. The data set covers a seven-month period which collectivelymeans the periods before COVID-19 and during COVID-19, and includes data from 238,152patients. Comparing statistics on daily patient volumes, average LOS, and resource usage,both before and during the COVID-19 pandemic, we found that patient characteristics anddemographics changed in COVID-19. While 18.92% and 27.22% of the patients requiredlaboratory and radiologic imaging tests before-COVID-19 study period, these percentageswere increased to 31.52% and 39.46% during-COVID-19 study period. 
By analyzing theeffects of policy-based variables in the model, we concluded that policies might cause sharpdecreases in patient volumes. While the total number of patients arriving before-COVID-19was 158,347, it decreased to 79,805 during-COVID-19. On the other hand, while the averagedaily LOS was 117.53 min before-COVID-19, this value was calculated to be 165,03 min\nB Yigit Kazancoglu\nyigit.kazancoglu@yasar.edu.tr\n1Yasar University, Department of Business Administration, ˙Izmir, Turkey\n2Bakırçay University Çi˘ gli Region Training and Research Hospital, Department of Emergency\nMedicine, ˙Izmir, Turkey\n3Digital Circular Economy for Sustainbale Development Goals (DCE-SDG), Jindal Global Business\nSchool, O P Jindal Global University, Haryana, India\n4Yasar University, Department of Logistics Management, ˙Izmir, Turkey\n5Sustainable Production and Consumption School of Management Anglia Ruskin University, Cambridge,\nUK\n123\n\n[Página 2]\n1074 Annals of Operations Research (2023) 328:1073–1103\nduring-COVID-19 study period. We finally showed that the model had a prediction accuracy\nof between 80 to 95%. While proposing an efficient model for sustainable operations manage-ment in EDs for dynamically changing environments caused by emergencies, it empiricallyinvestigates the impact of different policies on ED operations.\nKeywords Big data analytics ·Emergency department ·COVID-19 ·Machine learning ·\nSustainable operations\n1 Introduction\nMedical scientists and sociologists have widely researched the effects of the COVID-19pandemic on human physical and psychological health. 
Its impacts on operations and supplychain management have gained significant attention from scholars (Choi, 2021 ; Queiroz et al.,\n2020 ;S a r k i s , 2021 ) and industry experts (Deloitte, 2020 ; Harvard Business Review, 2020 ).\nHowever, although the COVID-19 pandemic has affected operations and supply chains ona large scale and most the companies have faced disruptions (Fortune, 2020 ) since it has\nalso created emergency situations in many countries, its impact on health services is a highpriority and needs to be addressed.\nEfficient and timely service delivery is a significant burden for health services, and the\nimportance of providing rapid responses increases in emergencies. However, as experiencedduring the COVID-19 pandemic, this is very challenging, particularly for EDs, which areincreasingly used as gateways to hospital admissions and have been identified as one ofthe most overcrowded health services units. Besides, since most countries provide a 7/24ED service, non-urgent patients frequently occupy them, which has also been identified asan essential issue leading to increased overcrowding (Ataman & Sariyer, 2021 ). While the\nproblem of overcrowding in EDs is a major challenge for the service providers even in regulartimes (Sariyer & Ataman, 2020 ), pandemic environments push these services into bottlenecks\nsince the number of patients being infected increases uncontrollably. In addition to this sharpincrease in patient volumes, the profiles and demographics of patient admissions to hospitalEDs also vary significantly. Under these circumstances, to protect the functioning of healthservices and EDs, governments are forced to impose widespread restrictions and prohibitions.To cope with the COVID-19 pandemic, the leaders of many countries declared sudden orphased lockdowns and quarantines and the closure of physical shops and businesses, transportbans, etc. 
Although these may help the functioning of EDs under emergencies and cause asudden decrease in patient volumes, it is crucial for ED service providers to rapidly adapt thesystem in response to such changes and be able to manage operations efficiently in highlydynamic conditions (Alinaghian & Goli, 2017 ; Hossain et al., 2021 ; Mondal & Roy, 2021 ;\nThakur et al., 2021 ). Thus, not only but especially under emergencies, EDs must have strong\ndynamic capabilities to manage these uncertain and dynamically changing environments.\nThese huge patient volumes and the extensive range of patient characteristics also create\nlarge volumes of data for EDs. Thus, these health services are additionally challenged bya ubiquitous context of big data, which has appeared as an exciting frontier of productivityand opportunity (Sanders & Ganeshan, 2018 ). In this era, data is also identified as a valuable\nasset of EDs, enabling insights and decision making (Feng & Shanthikumar, 2018 ). However,\nbig data requires the ability to process and arrange it to be used in decision-making. Thus,although the collected data is precious for EDs, unless they can analyze it and transform itinto useful information that can be turned into rapid action, it cannot go beyond useless data\n123\n\n[Página 3]\nAnnals of Operations Research (2023) 328:1073–1103 1075\nrecording that simply takes up storage capacity. At this point, BDA becomes increasingly\ncrucial for EDs in making efficient and timely decisions in emergency situations.\nThe term ’BDA’ is used to refer to the techniques, technologies, systems, practices,\nmethodologies, and applications for analyzing big data sets and is defined as a holistic processof collecting, managing, and investigating the five major dimensions of data: volume, variety,velocity, veracity, and value (Wamba et al., 2017 ). 
BDA can support operational and strategic\ndecision-making and turn to action in value creation for all organizational levels and enhanceoperational performance. BDA technologies have been implemented for various operationsand supply chain practices based on their superior performances (Gupta et al., 2021 ;K u m a r\net al., 2016 ,2020 ; Mari´ ce ta l . , 2021 ;M i s h r ae ta l . , 2018 ). In the big data era, BDA can be\nviewed as an organizational capability for EDs to cope with dynamically changing situa-tions. Thus, besides having strong dynamic capabilities, if an ED holds BDA capabilities tomanage big data, it should respond more actively to emergencies, increasing its efficiencyand performance in managing operations. Moreover, big data and BDA implementations inreal-time systems will have great importance in providing sustainable ED operations (Daset al., 2021 ;G o l ie ta l . , 2019 ,2021 ; Midya et al., 2021 ; Mondal & Roy, 2022 ). Having such\ncapabilities and advantages, BDA has attracted researchers, decision, and policymakers incoping with COVID-19 as a current global emergency (Abdel-Basset et al., 2021 ; Bag et al.,\n2021 ; Huang et al., 2020 ; Kapoor et al., 2021 ; Lee & Trimi, 2021 ; Mondal & Roy, 2021 ;\nPapadopoulos et al., 2020 ; Sharma et al., 2020 ; Sözen et al., 2022 ; Tirkolaee et al., 2022 ).\nAlthough these technologies are popular in the COVID-19 context, they have little use in\nthe ED operations decision-making processes in this pandemic period. On the other hand,since EDs are the main actors of health services in managing emergency environments,taking advantage of these technologies to improve EDs’ operations is critical in effectivelymanaging emergencies. Besides, since governmental reactions in fighting COVID-19 havecaused sharp and significant changes in the demand for EDs, investigating the effects of theseactions in EDs operations and putting these effects into account in decision-making modelsis another unique point. 
Therefore, this study aims to present a model implementing BDAtechnologies for managing four primary ED operations in COVID-19. By conducting inter-views with ED service providers and searching the related literature, the primary operationsthat are challenging for ED services in emergencies and even in regular times are deter-mined as managing daily patient volumes, average stay lengths of patients, and utilizationof laboratory radiologic imaging services. Besides proposing a generic model for managingED operations under emergencies and validating this model for different processes of EDs,taking the governmental actions as the main factors of this model and thus showing how theyaffect these operations is the novelty of this paper. Hence, we aim to answer the followingresearch questions in this paper:\nRQ1. How does BDA assist in making effective decisions for predicting daily patient\nvolumes, average stay lengths of patients, and resource utilization of EDs under dynam-ically changing conditions caused by emergencies?RQ2. How do government-imposed restrictions and prohibitions affect daily patientvolumes, average stay lengths, and ED resource utilization of EDs in emergencies?\nSince the current emergency having worldwide effects is the COVID-19 pandemic, we\nfocus on modeling ED operations during COVID-19 and identify the restrictions and prohi-bitions imposed to cope with this pandemic. To address these research questions, we proposea BDA-driven model and implement machine learning techniques as one of the most potentsub-set of BDA. More specifically, we implement neural networks-based techniques and mul-tilayer perceptron (MLP) algorithms to develop required predictions on daily patient volumes,\n123\n\n[Página 4]\n1076 Annals of Operations Research (2023) 328:1073–1103\naverage stay lengths, and daily utilization of laboratory and imaging services of EDs. 
In vali-\ndating this model in different ED operations, we define the output variables for each operationas previously stated and identify two sets of factors (input variables). While in the first set, weidentify possible operation-specific factors that may affect the output variable of this oper-ation. We define additional elements representing different types of government restrictionsand prohibitions in the second set. These factors are similarly used for each operation. Withthe proposed model and implemented MLP algorithm by obtaining 80% to 95% accuraciesfor predicting the output values of four ED operations, we answered the RQ1 of this studysince such accurate predictions play a crucial role in making efficient decisions EDs underemergencies. By investigating the significance of the relations between the output variablesand the set of input factors representing the government-imposed restrictions and prohibitionsand analyzing the directions of these relations, we answered the RQ2 of this study.\nThe organization of this paper is as follows. In Sect. 2, we discuss the theoretical back-\nground of this paper. We present the proposed model in Sect. 3and introduce the case study,\nand data set characteristics, data pre-processing steps, and results of the proposed model inSect. 4. Section 5discusses the findings of this study. We present the theoretical, managerial,\nand policy implications in Sect. 6. Section 7offers concluding remarks, limitations of this\nstudy, and the future research directions.\n2 Theoretical background\n2.1 The dynamic capabilities view\nDynamic capabilities define an organization’s ability to innovate, adapt to change, andimprove in a good way for its customers (Teece et al., 2016 ). Zollo and Winter ( 2002 ,\np. 
340) defined dynamic capability as a \"learned and stable pattern of collective activitythrough which the organization systematically generates and modifies its operating routinesto pursue improved effectiveness.\"\nThe dynamic capabilities utilize an organization’s internal and external resources in the\nbest possible manner to respond appropriately to environmental uncertainties (Teece et al.,1997 ). Emergencies cause environmental or external uncertainties, and managing opera-\ntions in EDs, particularly under emergencies, requires real-time information whereby serviceproviders can arrive at critical decisions. The dynamic capabilities help integrate primaryresources through the availability of this information and then further help to modify ED oper-ating routines and procedures appropriately. Therefore, we based our research on the dynamiccapability view. Positioning the resources correctly is the prime requisite for coping with theseuncertainties and the chaotic environments related to emergencies. Dynamic capabilities arethe main processes for sensing, integrating, learning, and reconfiguring resources and capa-bilities (Birkinshaw et al., 2016 ) and stress an organization’s capacity to create, extend or\nmodify its resources purposefully. These are also crucial in managing ED operations, par-ticularly in emergencies, since aligning the capabilities and resources and reconfiguring theprocesses may help dynamically deal with changing patient volumes and profiles. To dealwith unexpected increases in patient volumes in COVID-19, many countries reconfiguredtheir health systems, so pandemic services were opened to provide patients. The resourcesand capacities of these services, such as doctors, nurses, and other health staff, required med-ical equipment (medicines, beds, intensive care units, respiratory devices), were provided bymany different hospital departments and mainly from the EDs. 
In some countries where pan-demic services were not opened, EDs served as these services and encountered COVID-19\n123\n\n[Página 5]\nAnnals of Operations Research (2023) 328:1073–1103 1077\npatients. For such countries, the increased need for medical staff and resources was satisfied\nby reconfiguring the hospital’s other services and aligning them with the pandemic services.\nIn the health services operations and supply chain management literature, many stud-\nies base their theoretical backgrounds on the dynamic capability perspective (Rubbio et al.,2020 ). In the era of big data, health systems are one of the primary services that deal with\nbig data sets of the high volume, variety, and velocity of patient data. Thus, we move furthertowards BDA capability (BDAC), which has evolved from the dynamic capability perspec-tive. We, therefore, highlight the importance of having BDAC for managing health servicesoperations, particularly in emergencies.\n2.2 Big data analytics capability\nDuring the COVID-19 pandemic, BDA has been used to detect surface indicators related tothe pandemic (Guo et al., 2020 ). Real-time big data-driven insights have helped scholars and\ndecision-makers to comprehend the impact of this pandemic. COVID-19 trackers provide anessential source of data to help scholars research and make more informed decisions on copingwith this pandemic by collecting and aggregating big data (Verma & Gustafsson, 2020 ). Such\nsituations increase the volume and the variety of patients’ characteristics in health services.Besides, many external factors may come into play, changing the system dynamics. Undersuch circumstances, it is necessary for health services providers to rapidly adapt the systemto the changing conditions to provide timely and effective services to patients. 
Thus, the roleof BDAC in healthcare operations gained increased attention (Yu et al., 2021 ).\nWe propose a system for managing ED operations, such as forecasting patient volumes,\nanalyzing patient LOS, and modeling the use of primary resources in emergencies. Even inregular times, the main challenge faced by ED service providers is the overcrowded environ-ment of these services, which creates vast volumes and varieties of patients. An emergencyis an external challenge that may cause an unexpected and sharp increase in patient volumesand varieties, thus straining the system and making managing operations much more difficult.Government is a prominent actor as a system enabler in this era. To protect the functioningof these services and respond to emergencies, governments impose some policies, such asrestrictions and prohibitions, which may cause a sudden decrease in patient volumes but stillchange the characteristics and increase the system’s randomness. All these create dynami-cally changing environments, and the service providers must adopt the system appropriatelyand effectively in response to these rapidly changing conditions. Since by their nature anddue to all these sudden changes, ED services include a huge volume, variety, velocity, andveracity of data, these services may take advantage of BDA to help operations cope withsuch rapid changes in the system. We summarise the theoretical framework of our researchin Fig. 1.\nAs seen in Fig. 1, based on huge volumes, velocities, and varieties of patients, the\ndata inherent in the EDs exhibits a dynamic feature. Since emergencies are also featuredwith rapidly changing conditions, these increase the randomness in the EDs and, therefore,stalemate decision-making processes in EDs. This study attempts to contribute to dynamiccapability theory and BDAC by extending their usage for the decision-making processes ofone of the most important actors of health services, EDs, under emergencies. 
By presentingthe rapidly changing features of the EDs in emergencies and presenting a model highlightinga need for BDAC, this study aims to contribute to the context of these theories.\n123\n\n[Página 6]\n1078 Annals of Operations Research (2023) 328:1073–1103\nFig. 1 Theoretical framework of this research\n3 Proposed models\nIn this paper, we propose models for managing the primary operations of EDs, particu-\nlarly in emergencies. These models include five main sequential steps: Data Collection,Pre-processing, Modelling, Testing & Model Evaluation, and Providing Managerial & Pol-icy Implications. As discussed earlier, ED environments contain big data sets that can beprocessed with BDA, and valuable information can be obtained in decision-making. Thus,an essential initial step for adapting these emerging technologies into proposed models andsystems is bringing data sets related to the context. A data set can be obtained using differentsources within this research framework. To get the related data of the proposed models, werequired data triangulation. Valuable data sets for the proposed models are secondary datareceived from a case ED covering the period before and during COVID-19; governmentreports; documentary analysis; and interviews with ED service providers. Case study datamay include relevant information about patients arriving at this ED during the study period.Government reports and documentary analyses should be checked to identify the types ofrestrictions and prohibitions imposed by the government to cope with the emergency. Finally,interviews and documents should be used to decide on the main challenges to ED operations,making planning and managing operations more difficult in emergencies. 
Related metrics andtargeted values of these metrics can also be identified by collecting data through interviewsand a literature search.\nSince the collected data is raw data, which in its current form is not suitable for analyz-\ning and modeling, different data pre-processing tasks must be performed. It is necessary todefine the input and output variables of the model, define the periodicity (hourly, daily, weekly,monthly, etc.) of the analysis, and determine ways to measure the values of the variables. Datatransformation may also involve measuring the values of the variables. One of the main pre-processing tasks in big data studies is cleaning the data set to remove redundant or inappropri-ate data, missing values, and outliers. After all these tasks have been performed, the structureddata set, which can further be processed with BDA tools and techniques, is obtained.\nOnce the structured data set of the model is ready, the modeling step comes next. The\nobtained data set is split into two train and test sets. Train data sets include the values ofall the input and output variables, whereas since the test data set will be used to evaluatethe model’s prediction accuracies, it does not include the values of the output variables. The\n123\n\n[Página 7]\nAnnals of Operations Research (2023) 328:1073–1103 1079\ntrain data set is further processed with machine learning as one of the most widely used BDA\ntechniques. Machine learning presents algorithms to extract knowledge and make efficientdecisions by learning from given data sets. Researchers widely prefer these algorithms basedon their flexibility in using data to capture complex and non-linear behaviors (Choi et al.,2018 ). Among various machine learning algorithms, MLP neural networks have received\nsignificant attention since these are appropriate and efficient for function approximation,pattern classification, and prediction. 
Incorporating hidden layers between input and outputlayers is one of the other parser properties of these algorithms. When required by extendingthe number of hidden layers, MLP neural networks can expand the number of input featurecombinations to improve the model’s learning ability, finally increasing the prediction power.Although many other BDA techniques have been widely implemented in the literature, themachine learning-based MLP neural network algorithm is integrated into the proposed modelbased on these properties and superiorities.\nThe testing and model evaluation step comes next in the proposed model. The obtained\nMLP algorithm with the optimized parameters is applied to the test data set to get the predictedvalues of the output variables of interest. The predicted values are then compared with theactual values, and the mean errors and accuracies of the prediction should be calculated. Theseperformances should then be compared with the target values. If the targets are achieved orthe model performance goes beyond the targeted one, the model can be proposed for real-life applications. The results on the significance and impacts of government restrictions andprohibitions may also be discussed in detail, and implications should be recommended topolicymakers. Suppose the model performance cannot achieve the targets. In that case, it isnecessary to go back to the data pre-processing step and re-define the model input and outputvariables. The modeling, testing, and evaluation steps must be repeated until proper modelshave been obtained. The proposed model is shown in Fig. 2.\nFig. 2 Flowchart of the proposed model\n123\n\n[Página 8]\n1080 Annals of Operations Research (2023) 328:1073–1103\n4 Case study\n4.1 Case study specification\nWe collected the data set of this study from an ED of a research and training hospital located\nin a metropolitan region in Izmir, Turkey. The daily number of patients or visits to this EDis more than 1,000. 
This huge patient volume is due to several reasons. First, as mentionedpreviously, overcrowding is a common problem in EDs. Second, due to the vast volumes ofnon-urgent patient visits, this problem can be more severe in some countries, such as Turkey,compared to many other countries. Third, many patients may choose to be treated in thishospital due to its type. Fourth, since this is a public hospital, receiving service from EDs isfree of charge. Fifth, since it is located in a metropolitan region and is very close to publictransport stations and the city center, it is also easily accessible for ambulances. Sixth butnot least, since this ED provides uninterrupted service (7 days and 24 h) while many of theother departments of this hospital provide service only within working hours on weekdays,this causes additional visits of patients of different departments to EDs out of the workinghours. These characteristics created huge volumes, velocities, and varieties in the data set.\nIn Turkey, the first COVID-19 case was reported on March 10, 2020, in Istanbul city, and\nthe virus then spread quickly to the whole country. In Turkey, the COVID-19 was encounteredlater than in many other countries. Thus, public awareness had already been created aboutthis virus and the pandemic. Public awareness was a crucial initial step in coping with thisvirus. Since it first appeared in Turkey, the government started announcing policies like\"social distancing,\" \"hygiene,\" and \"stay at home.\" However, raising public awareness fromthe outset and making announcements was not enough to prevent the spread of the virus.Then, the government imposed other types of restrictions and prohibitions. Restrictions forthe elderly, inter-city transport bans and restrictions for the young were imposed startingfrom the end of March. 
In addition, starting from the middle of April, total curfews wereimposed at weekends (for two days) and for extended weekends in some of the weeks, whichcould last up to three or four days. The number of cases and deaths started to fall by May.Then the period of normalization began at the beginning of June. Although restrictions andprohibitions were still in use during this month, they were more relaxed.\nHaving high volumes, velocities, and varieties in patient sizes and characteristics, the\nselected ED was identified as proper for this study’s theoretical framework and methodology.Besides, since in different periods (such as before March and during April) and days (suchas weekdays and weekends), government-imposed actions were highly changing during thestudy period, the case ED allowed to investigate the impact of these actions on ED operations.\n4.2 Data set characteristics\nThe data set covers seven months, from December 2019 to June 2020, and includes 238,152patients. Data from between March 10 to the end of June 2020 represents data collected duringthe period of COVID-19’s first peak in Turkey. To have a similar number of days before theCOVID-19 period, the related data set was started in December 2019. Thus, before COVID-19 and during COVID-19 periods cover around 3.5 months of data. For each arriving patient,records of the ED case include the following information: patient ID, gender, age, arrivaltype, triage level, date of arrival, time of arrival, diagnostic tests for treatment-if required,related times for diagnostic tests, assigned diagnosis type by a doctor after treatment, andtime of departure. The patient ID is unique for each patient arrival. Gender is recorded as\n123\n\n[Página 9]\nAnnals of Operations Research (2023) 328:1073–1103 1081\nmale and female. Age is recorded as it is in a continuous form. 
The arrival type represents\nif a patient arrived by themselves or by ambulance, so it is recorded as one of two options:\"walk-in\" or \"by ambulance.\" When a patient comes to this ED, they are first met by a triagenurse, who triages the patient based on his complaints and clinical acuities. This ED uses the3-level Emergency Severity Index for patient triage.\nFurthermore, trauma patients are treated in a different zone. Thus, arriving patients are\nassigned to one of four zones labeled green, yellow, red, and trauma zones. The arrivaldate represents the full date of the patient’s arrival in a day, month, and year form. Time ofarrival shows the exact time of arrival in an hour, minute, and second form. Many diagnostictests can be ordered in EDs for patient diagnosis. The label of the requested test, and therelated ordering time, approval time, and result time are recorded in the next three rows inan hour, minute, and second form. When doctors diagnose the patients, they assign the typeof diagnosis based on the International Classification of Diagnosis 10th version (ICD-10).Thus, the diagnosis cell includes the diagnosis based on the ICD-10 codes, which can have22 different categories. The last cell consists of the departure time of the patient in an hour,minute, and second form.\nThe data set includes additional attributes to represent government restrictions and prohi-\nbitions. The four main restrictions and prohibitions imposed in Izmir city are considered inthe proposed models. During the COVID-19 study period, total curfew (lockdowns), curfewfor the young (age ≤20), curfew for the elderly (age ≥65), and transport bans were imposed.\nThese are also adopted in the proposed models as model input variables, as discussed in thenext section on data pre-processing.\nAs presented in Fig. 2, selecting the study variables is an important initial step of the\nproposed model. 
However, it should be kept in mind that these variables are not fixed andrigid and may depend on the selected case studies. Different variables may define the system’sinternal and external dynamics for other cases.\n4.3 Data pre-processing\nWe implement the proposed model with four different ED operations to investigate how theimposed policies have changed and affected the primary operations and resource usage. Thefirst and second operations, Operation 1 and Operation 2, respectively predict the daily num-ber of patients arriving and the average LOS of these patients (LOS is defined as the timebetween the patient’s arrival and their departure) for each day during-COVID-19 period.Different diagnostic tests can be mainly grouped into either laboratory tests or radiologicimaging tests. Thus, we also implement the model for two other operations to analyze theprimary resource usage. Operation 3 and Operation 4 predict daily numbers of ordered labo-ratory tests and radiologic imaging tests for diagnosing patients. Regarding output variablesor attributes of the model for each operation, these are defined adequately as the daily numberof patients, average daily LOS of patients, the daily number of laboratory tests ordered, andthe daily number of radiologic imaging tests ordered during-COVID-19 period.\nSince the aim is to model and manage related daily values, the data set was initially trans-\nformed. In this process, we eliminated the repetitive values from the data set. More than oneICD-10 encoded diagnosis can be assigned to a patient. Different laboratory tests (hemogram,biochemistry, enzyme, hormone, etc.) or radiologic imaging tests (X-ray, tomography, ultra-sound, magnetic resonance imaging, etc.) 
can also be ordered for a patient with a unique ID.While obtaining the corresponding daily value of the models, we eliminated these repetitiveor redundant values.\n123\n\n[Página 10]\n1082 Annals of Operations Research (2023) 328:1073–1103\nBesides the policy-based attributes, some other input variables were also defined to adopt\nthe system characteristics in the proposed models. These variables were used to represent thesystem dynamics in normal circumstances. Previous studies showed that the day of the weekhas a significant effect on patient volume and LOS (Sarıyer et al., 2020 ). Existing literature\nalso presented that the patient volume, LOS, and numbers of diagnostic tests ordered differedsignificantly between categories of demographic variables (Sarıyer & Ataman, 2020 ). We,\ntherefore, identified these factors as internal factors to represent the ED environment in normalcircumstances. To measure the values of these inputs, we used the study’s data set coveringthe before-COVID-19 period. As in output variables, we made the required transformationsto obtain the daily values of these input variables. The data set is described in Table 1.\nWe performed data pre-processing by dropping missing values in the dataset by using the\ndropna() function of the pandas module in Python. 
After this, based on standardization, we\nremoved the outliers from the data set by using the zscore() function of the pandas module\nTable 1 Definitions and measurement scales of the model variables\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\n1: Managing daily\nnumbers of patientY1: The daily number of\npatients arriving eachd a yi nt h eduring-COVID-19study period(numerical)X1: The average daily\nnumber of patientsarriving for each day ofthe week—Mondaythrough to Sunday(numerical)Representing govern-\nment restrictions andprohibitions\nX2: The whole curfew\nexists in the day to bepredicted or not (cat-egorical)\nX3: Curfew for young\nexists in the day to bepredicted or not (cat-egorical)\nX4: Curfew for the\nelderly exists in theday to be predictedor not (categorical)\nX5: Transport ban\nexists in the day tobe predicted or not(binary)\n2: Managing daily\naverage LOS ofpatientsY2: Average daily LOS\nof patients arriving eachd a yi nt h eduring-COVID-19study period(numerical)X7-X8: average daily\nLOS of female-malepatients for each day ofthe week (numerical)\nX9-X10-X11: Average\ndaily LOS of agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\n123\n\n[Página 11]\nAnnals of Operations Research (2023) 328:1073–1103 1083\nTable 1 (continued)\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\nX12 through X15:\nAverage daily LOS oftriage groups—red,yellow, green, traumazones—for each day of\nthe week (numerical)\nX16 through X37:\nAverage daily LOS ofICD-10 encodeddiagnosis, for 21\ngroups\n*, for each day\nof the week(numerical)\n3: Managing daily\nnumbers of ordered\nlaboratory testsY3: The daily number of\nlaboratory tests ordered\nin 
the\nduring-COVID-19study period(numerical)X38-X39: Average daily\nnumbers of laboratory\ntests ordered for\nfemale-male patientsfor each day of theweek (numerical)\nX40-X41-X42: Average\ndaily numbers of\nlaboratory testsordered for agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\nX43-X44: Average daily\nnumbers of laboratorytests ordered for arrivaltype groups—byambulance orwalk-in—for each dayof the week(numerical)\nX45 through X48:\nAverage daily numbersof laboratory testsordered for triagegroups; red, yellow,green, trauma zones,for each day of theweek (numerical)\n123\n\n[Página 12]\n1084 Annals of Operations Research (2023) 328:1073–1103\nTable 1 (continued)\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\nX49 through X69:\nAverage daily numbersof laboratory testsordered for ICD-10encoded diagnosis, for\n21 groups\n*, for each\nday of the week(numerical)Representing system\ndynamics\nX1-fcast: Predicted\ndaily number ofpatients with Model\n1 on each day\nduring-COVID-19study period(numerical) –used in2\nnd,3rd,a n d4th\noperations modeling\n4: Managing daily\nnumbers of orderedradiologic imagingtestsY4: The daily number of\nradiologic imaging testsordered in theduring-COVID-19study period(numerical)X70-X71: Average daily\nnumbers of radiologicimaging tests orderedfor female-malepatients for each day ofthe week (numerical)\nX72-X73-X74: Average\ndaily numbers ofradiologic imagingtests ordered for agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\nX75-X76: Average daily\nnumbers of radiologicimaging tests orderedfor arrival typegroups—by ambulanceor walk-in—for eachday of the week(numerical)\nX77 through X80:\nAverage daily numbersof radiologic imagingtests ordered for triagegroups—red, yellow,green, traumazones—for each day ofthe week (numerical)\nX81 through 
X101:\nAverage daily numbers of radiologic imaging tests ordered for ICD-10 encoded diagnosis, for 21 groups*, for each day of the week (numerical)\n123\n\n[Página 13]\nAnnals of Operations Research (2023) 328:1073–1103 1085\nin Python. We initiated the categorical conversion of the input variables with the Categorical\nclass initializer of the pandas module in Python. We used the Categorical class to encode\nnumerical values as categorized by the capability of initializing the corresponding variables with categorical values. After these pre-processing steps, we obtained the structured data set for further modeling with the MLP neural network.\nAs seen in Table 1, we identified the government policies as common input variables in each\noperation to analyze their effects on each of the defined output variables for the corresponding operations. However, once we predicted the daily number of patients in Operation 1, we used these predictions to describe system characteristics in all other models. The daily number of patients may affect the average daily LOS, and the number of each diagnostic test ordered.\n5 Results\n5.1 Descriptive results\nThe study period covering the before-COVID-19 period included 100 days of data, and the total number of patients arriving during these days was 158,347. Laboratory tests were ordered for 29,953 of these patients and 43,106 radiologic imaging tests. On the other hand, the study period covering the during-COVID-19 period included 113 days of data, and the total number of patients arriving during these days was 79,805. The number of laboratory and radiologic imaging tests ordered during this period was 25,154 and 31,488. The average daily LOS was 117.53 min in the before-COVID-19 period and 165.03 min in the during-COVID-19 period. Daily values for the number of patients, average LOS, and numbers of each type of diagnostic test ordered in the whole study period are depicted in Fig. 
3.\nThese results show that while daily and total numbers of patients and diagnostic tests\nordered sharply decreased, average LOS values increased during the during-COVID-19period compared to before-COVID-19. However, although decreases are seen in three ofthe operations’ output variables (1, 3, 4), the sharpest decline was seen in Operation 1’s out-put, the daily number of patients. The decrease in patient numbers may have also caused thedecline in the number of tests ordered. On the other hand, it should be noted that, althoughpatient and diagnostic test numbers decreased, average LOS values increased. All these criti-cal numerical findings could be due to the change in the system dynamics, which were mainlycaused by patients who occupied EDs unnecessarily and did not need an emergency service.\nWe categorized the patients into three groups to support this idea by numerical findings\nconsistent with our model boundaries and comparatively presented the related statistics for\nFig. 3 Daily values of the models’ output variables in the study period\n123\n\n[Página 14]\n1086 Annals of Operations Research (2023) 328:1073–1103\neach of these. These categories were: patients requiring no diagnostic tests, laboratory tests,\nand radiologic imaging tests. Since diagnostic tests are one of the most critical resources fordiagnosing patients, we believe most patients for whom no tests are ordered can representthe cases that occupy EDs for non-urgent conditions.\nFor these categories, the average daily numbers of patients and their average LOS are\nshown for each day of the week before-COVID-19 and during-COVID-19 periods in Fig. 
4.\nFigure 4shows that while average daily values for patient numbers decreased in each of the\nthree categories in the during-COVID-19 period compared to the before-COVID-19 period,the majority of the decrease is related to the category of patients requiring no diagnostic test.Although it is worth noting that reductions were seen in the number of patients requiring nodiagnostic test, some increases were seen in their average LOS values in the during-COVID-19 period. This finding mainly supports our hypothesis. On the other hand, at least some\nFig. 4 Daily average patient numbers and LOS values for each day of the week\n123\n\n[Página 15]\nAnnals of Operations Research (2023) 328:1073–1103 1087\ndecreased levels were observed in the average LOS values of patients requiring diagnostic\ntests during the pandemic period. This could be due to the decreases in resource utilization.When resource utilization decreases, it accelerates access to resources and enables moreefficient use. Based on the daily distributions of patient numbers, one other finding should benoted. In the patients requiring no diagnostic test category, while Saturdays and Sundays, thatis, the weekend, had the highest daily patient numbers compared to weekdays in the before-COVID-19 period, daily numbers were the highest on Mondays in the during-COVID-19period. The impact of government restrictions and prohibitions on ED operations is directlyseen in this finding. 
Since most of the weekends, total curfews were imposed during thisperiod, patient volume, particularly in the patients requiring no diagnostic test category,sharply decreased at weekends.\nTable 2shows the total number of patients arriving at this ED based on the categories of\nthe considered demographics (gender, age, triage, arrival types, diagnosis) for the before-and during-COVID-19 study periods comparatively.\nFrom the values of Table 2, it should be seen that the distribution of patient numbers\nbased on gender changed in the during-COVID-19 period compared to the before-COVID-19period, as the number of male patients increased. Differences were also depicted based on agedistributions. For each of the three categories, in the young group, age:[0–14], patient numbersand distributions sharply decreased in the during-COVID-19 period, and in the elderly group,age≥65. In contrast, distributions fell in the patients requiring diagnostic tests category\noverall. There was some increase in this age category. Additionally, for all three types,the distribution of patients arriving by ambulance increased in the during-COVID-19 studyperiod. Another important finding showed that, while distributions of green zone patientssignificantly decreased in the patients requiring no diagnostic test category, the distributionof green zone patients increased in some other categories. Finally, significant differenceswere observed between 22 different ICD-10 encoded diagnosis types on the distributions ofthe four main groups. These ICD-10 codes were J00-J99 (disease of the respiratory system),M00-M99 (disease of musculoskeletal system and connective tissue), R00-R99 (symptoms,signs, and abnormal clinical and laboratory findings, not elsewhere classified), and U00-U85(codes for special purposes, COVID-19 here). 
The significant differences in the distributions of these diagnosis types are associated with the COVID-19 pandemic and the season.\n5.2 Model results\nThe proposed model was implemented in the obtained data sets of the corresponding case study. Since we focus on four primary ED operations, the model was tested repetitively four times for Operations 1 through 4, which increased the model’s validity.\nIn this section, the relation between the identified input variables and the corresponding\noutput variables for each ED operation of interest will be presented based on the results of the Pearson correlation analysis. The statistical association between the model variables is presented in a heat-map structure in the Appendix for each operation. In Table 3, we showed\nthe direction, magnitude, and significance level of the relationships, notably the significant input variables of the model for each operation.\nFrom the values of Table 3, it is observed that the defined input variables of Operation\n1, X1 through X5, were all significantly related to the output variable Y1. Besides, the relations were in a negative direction. 
This demonstrates how policy-based restrictions andprohibitions reduce the predicted number of daily patients in the during-COVID-19 period.Nonetheless, while it is observed that the system dynamics related to input variable X1 had a\n123\n\n[Página 16]\n1088 Annals of Operations Research (2023) 328:1073–1103\nTable 2 Distributions of each patient demographic variable for three categories in the before- and during-\nCOVID-19 periods\nVariable Levels Patients requiring no\ndiagnostic testPatients requiring\nlaboratory testsPatients requiring\nradiology tests\nBefore During Before During Before During\nn (%) n (%) n (%) n (%) n (%) n (%)\nGender Female 47,670\n(47.742)14,784\n(42.444)16,636\n(55.540)12,407\n(49.324)21,894\n(51.177)14,899\n(47.316)\nMale 52,179\n(52.258)20,048\n(57.556)13,317\n(44.460)12,747\n(50.676)20,887\n(48.823)16,589\n(52.684)\nAge age: [0–14] 20,722\n(20.753)3,683\n(10.574)4,726\n(15.778)1,715\n(6.818)7,991\n(18.679)2,951\n(9.372)\nage:\n(15–64)70,980\n(71.087)27,538\n(79.059)17,730\n(59.193)18,310\n(72.792)26,814\n(62.677)23,309\n(74.025)\nage≥65 8,147\n(8.159)3,611\n(10.367)7,497\n(25.029)5,129\n(20.390)7,976\n(18.644)5,228\n(16.603)\nTriage\nlevelgreen room 68,335\n(68.438)12,122\n(34.801)2,624\n(8.760)6,490\n(25.801)7,746\n(18.106)7,279\n(23.117)\nyellow\nroom23,212\n(23.247)14,888\n(42.742)20,542\n(68.581)12,037\n(47.853)21,406\n(50.036)12,280\n(38.999)\nred room 2,313\n(2.316)2,076\n(5.960)5,904\n(19.711)5,737\n(22.808)4,833\n(11.297)4,950\n(15.720)\ntrauma\nroom5,989\n(5.998)4,904\n(14.079)883\n(2.948)890\n(3.538)8,796\n(20.561)6,979\n(22.164)\nArrival\ntypewalk in 98,553\n(98.702)33,148\n(95.165)24,508\n(81.822)19,224\n(76.425)37,374\n(87.361)25,642\n(81.434)\nby ambu-\nlance1,296\n(1.298)1,684\n(4.835)5,445\n(18.178)5,930\n(23.575)5,407\n(12.639)5,846\n(18.566)\nICD-10\nencodeddiagno-sisA00-B99 3,095\n(3.100)755\n(2.168)241\n(0.805)193\n(0.767)156\n(0.365)96 (0.305)\nC00-D49 32\n(0.032)24\n(0.069)49\n(0.164)31\n(0.123)43\n(0.101)24 
(0.076)\nD50-D89 135\n(0.135)139\n(0.399)75\n(0.250)88\n(0.350)37\n(0.086)51 (0.162)\nE00-E89 108\n(0.108)122\n(0.350)131\n(0.437)117\n(0.465)74\n(0.173)81 (0.257)\nF01-F99 696\n(0.697)515\n(1.479)223\n(0.744)183\n(0.728)132\n(0.309)\n124\n(0.394)\nG00-G99 1,211\n(1.213)540\n(1.550)335\n(1.118)221\n(0.879)415\n(0.970)277\n(0.880)\nH00-H59 646\n(0.647)453\n(1.301)10\n(0.033)7 (0.028) 12\n(0.028)6 (0.019)\n123\n\n[Página 17]\nAnnals of Operations Research (2023) 328:1073–1103 1089\nTable 2 (continued)\nVariable Levels Patients requiring no\ndiagnostic testPatients requiring\nlaboratory testsPatients requiring\nradiology tests\nBefore During Before During Before During\nn (%) n (%) n (%) n (%) n (%) n (%)\nH60-H95 1,541\n(1.543)576\n(1.654)61\n(0.204)36\n(0.143)63\n(0.147)46 (0.146)\nI00-I99 1,113\n(1.115)730\n(2.096)1,192\n(3.980)857\n(3.407)959\n(2.242)715\n(2.271)\nJ00-J99 36,073\n(36.128)5,368\n(15.411)3,174\n(10.597)5,223\n(20.764)4,427\n(10.348)4,913\n(15.603)\nK00-K95 3,925\n(3.931)1,789\n(5.136)1,580\n(5.275)935\n(3.717)1,184\n(2.768)753\n(2.391)\nL00-L99 1,384\n(1.386)1,154\n(3.313)69\n(0.230)67\n(0.266)38\n(0.089)47 (0.149)\nM00-M99 13,190\n(13.210)7,625\n(21.891)2,459\n(8.210)1,933\n(7.685)14,039\n(32.816)8,924\n(28.341)\nN00-N99 2,050\n(2.053)1,206\n(3.462)2,434\n(8.126)1,562\n(6.210)1,673\n(3.911)1,195\n(3.795)\nO00-O9A 28\n(0.028)25\n(0.072)17\n(0.057)18\n(0.072)54\n(0.126)28 (0.089)\nP00-P96 49\n(0.049)51\n(0.146)50\n(0.167)39\n(0.155)5 (0.012) 4 (0.013)\nQ00-Q99 3 (0.003) 5 (0.014) 4 (0.013) 5 (0.020) 5 (0.012) 6 (0.019)\nR00-R99 11,797\n(11.815)3,544\n(10.175)13,110\n(43.769)7,599\n(30.210)12,321\n(28.800)6,957\n(22.094)\nS00-T88 2,556\n(2.560)1,790\n(5.139)193\n(0.644)179\n(0.712)632\n(1.477)537\n(1.705)\nU00-U85 0 (0.000) 644\n(1.849)0 (0.000) 2,106\n(8.372)0 (0.000) 1,971\n(6.260)\nV00-Y99 1,448\n(1.450)1,286\n(3.692)517\n(1.726)426\n(1.694)1,509\n(3.527)801\n(2.544)\nZ00-Z99 
18,769\n(18.797)6,491\n(18.635)4,029\n(13.451)3,329\n(13.234)5,003\n(11.694)3,932\n(12.487)\nsignificant relation with the model output variable, the relations of the policy-based variables,\nparticularly X5, X2, and X3, were more substantial. However, for Operation 2, we observedthat most of the selected input variables were not significantly related to Y2. We observedthat only X1-fcast and X5 were related considerably to Y2. As also seen in Table 3,m o s to f\nthe selected input variables of the model were significant while modeling Operations 3 and4. We also observed that some of the selected policy-based variables had significant negativerelations with Y3 and Y4. This result demonstrated that such policies caused substantialdecreases in resource usage of EDs during-COVID-19 period.\nAfter analyzing the effects of the identified input variables on the operations, we further\nprocessed the obtained data sets using the MLP neural networks. MLPRegressor in the neuralnetwork package of the sklearn module in Python was initialized to process the data sets of the\nmodels. 
The solver function of the algorithm chosen was adam() and the activation function\n123\n\n[Página 18]\n1090 Annals of Operations Research (2023) 328:1073–1103\nTable 3 Correlation results for significant input parameters of the model for each of the operations\nModeling daily patient\nnumbers: Operation 1Modeling average\ndaily LOS: Operation2Modeling daily numbers\nof ordered laboratorytests: Operation 3Modeling daily\nnumbers of orderedradiologic imagingtests: Operation 4\nrY1−X1=-0.25**\nrY1−X2=-0.40**\nrY1−X3=-0.43**\nrY1−X4=-0.22*\nrY1−X5=-0.76**rY2−X1−fc a s t =\n0.18*\nrY2−X5=− 0.29**rY3−X1−fc a s t =\n0.39**\nrY3−X2=-0.36**\nrY3−X38=0.24**\nrY3−X39=0.33**\nrY3−X40=0.19*\nrY3−X41=0.27**\nrY3−X43=0.29**\nrY3−X46=0.33**\nrY3−X49=0.28**\nrY3−X51=0.39**\nrY3−X53=0.19*\nrY3−X55=− 0.28**\nrY3−X58=0.37**\nrY3−X59=0.22\nrY3−X62=− 0.24**\nrY3−X63=− 0.28**\nrY3−X64=− 0.38**\nrY3−X66=0.27**\nrY3−X67=0.39**rY4−X1−fc a s t =\n0.87**\nrY1−X2=-0.42**\nrY1−X3=-0.31**\nrY1−X5=-0.66**\nrY1−X70=0.34**\nrY1−X71=0.36**\nrY1−X72=0.42**\nrY1−X73=0.38**\nrY1−X74=0.30**\nrY1−X75=0.42**\nrY1−X77=0.26**\nrY1−X78=0.40**\nrY1−X79=0.32**\nrY1−X80=0.28**\nrY1−X82=0.32**\nrY1−X83=0.30**\nrY1−X87=0.25**\nrY1−X88=0.19*\nrY1−X90=0.36**\nrY1−X91=0.32**\nrY1−X92=-0.30**\nrY1−X93=0.37**\nrY1−X95=− 0.20*\nrY1−X98=0.30**\nrY1−X99=0.24**\n*Correlation is significant in 95%CI\n**Correlation is significant in 99%CI\nselected was relu() . The train test split was used for experimentation, and the separation was\napplied randomly. The train/test split value of 0.8 was applied. The experiment was repeatedseveral times to obtain the optimal model parameters for learning rate, momentum, and thenumber of hidden layers. The prediction performances of the models were tested on the testdata sets based on the mean absolute percentage error (MAPE), and the root mean squareerror (RMSE) statistics. 
The optimal model parameters specific to each model and modelperformances are represented in Table 4.\nTable 4shows that the proposed model performs well for managing ED operations in\nthe COVID-19 periods. The model, tested in four different operations, achieved around 90%accuracy in two of these operations and 95% accuracy in one. On the other hand, in one of the\n123\n\n[Página 19]\nAnnals of Operations Research (2023) 328:1073–1103 1091\nTable 4 MLP neural network performances on ED operations predictions during-COVID-19\nED operations during-COVID-19 and\nrelated modelOptimized parameters (learning\nrate-LR, momentum-M, number ofhidden layers-HLModel\nperformance\nMAPE RMSE\nModelling daily patient numbers:\nOperation 1LR=0.01, M =0.01, HL =2 10.573 88.624\nModelling daily average LOS:\nOperation 2LR=0.5, M =0.2, HL =3 19.309 40.473\nModelling daily numbers of ordered\nlaboratory tests: Operation 3LR=0.001, M =0.125, HL =4 9.884 28.325\nModelling daily numbers of ordered\nradiologic imaging tests: Operation 4LR=0.019, M =0.19, HL =3 5.924 20.324\noperations modeling average daily LOS, the model performance was lower, having around\n80% accuracy. The model results are also consistent with the findings on the relationshipbetween model attributes. Since lower relations were observed between variables on LOSmodeling, prediction performance could not achieve the modeling performances on otheroperations with higher correlation levels between the variables. Nonetheless, the achievedaccuracies were still acceptable and practically implementable compared with related studiesand targeted levels.\n6 Discussion\nThis study emphasizes implementing emerging technologies, particularly BDA, in manag-ing health services’ operations. As noted in the literature (Akter & Wamba, 2019 ; Donthu &\nGustaffson, 2020 ), we believe that the challenges posed by COVID-19 can be tackled using\nthese technologies. 
Grounded in dynamic capabilities and the related context of BDAC, weproposed a model for the management of ED operations in emergencies. To show the valid-ity of the proposed model, we tested it in four different primary operations of EDs. Whiledefining the model variables, besides using the system dynamics-related factors, we imple-mented additional variables to represent the effect of government restrictions and prohibitionsimposed to cope with emergencies. Thus, we contribute to the literature by proposing an effi-cient system for managing ED operations in emergencies by implementing emerging BDAtechnologies and investigating the effects of these policy-based factors on ED operations.\nThe model has been validated using real-life data from a large-scale ED operating in\n˙Izmir city, Turkey. Although the overcrowded environments of EDs are a global problem,\nthis problem is worse in some countries, such as Turkey, in which EDs are frequently occupiedunnecessarily by non-emergent patients. By comparing the daily and total patient volumes inthe before- and during-COVID-19 study periods, the descriptive findings on the case data setmainly represent the significance of this problem in this ED since patient volumes sharplydecreased during-COVID-19 period. By classifying patients into three categories—patientsrequiring no diagnostic tests, laboratory tests, and radiologic imaging tests—and identifyingthat the reduction in patient volume was mainly caused by the first category (patients requir-ing no diagnostic tests), we also provide evidence to support this finding. We additionallysupport this finding by observing increases in the average LOS values of patients who do not\n123\n\n[Página 20]\n1092 Annals of Operations Research (2023) 328:1073–1103\nrequire any diagnostic tests. Contrarily, the average LOS values were observed to decrease\nfor patients requiring diagnostic tests during-COVID-19 period. 
All these findings demon-strate that most patients make unnecessary visits to this ED. This result supports the existingstudies reporting a substantial decrease in ED visits during the COVID-19 (Jeffery et al.,2020 ; Schereyer et al., 2020 ). We also contribute to the literature by linking this result to one\nof the biggest operational challenges of EDs and demonstrating that unnecessary visits arethe leading cause of overcrowded ED environments. Besides, from the practical viewpoint,the decrease in patient numbers and diagnostic test orders during COVID-19 may be usedfor hospital managers’ better scheduling and allocation of ED resources. Although a sharpdecline was observed in these values, a significant increase was observed in patients’ averageLOS values, meaning that arriving patients to EDs during-COVID-19 required more andlonger interventions and treatments. Thus, better planning and allocation of ED resourceswill be essential for functioning these services during emergencies.\nSignificant decreases in patient volume during-COVID-19 period may be related to two\nmain factors. First, the pandemic created stress in patients. To protect themselves from beinginfected, they may have avoided visiting EDs if they did not have emergent or urgent sit-uations. Second, due to the government restrictions and prohibitions imposed, people werepartially obliged to stay at home if they did not need an emergent or urgent health service.Since the first factor is more behavioral, it is beyond the scope of this study. However, weaimed to identify the impacts of policy-based factors on ED operations by adopting our modelinto a case study representing the overcrowding of ED environments and frequently unneces-sary ED visits. This result supports the existing studies reporting decreased patient volumesdue to the governmental actions taken in fighting COVID-19 (Kendzerska et al., 2021 ; Sözen\net al., 2022 ). 
It also enhances literature by considering this effect in developing prediction\nmodels for patient volumes, average stay lengths of patients, and resource utilization of EDsduring this pandemic period.\nThe depicted decreases in the average LOS values of patients requiring laboratory or\nradiologic imaging tests in the during-COVID-19 period compared to the before-COVID-19period highlights another essential finding of this study. While this finding has been widelypresented in the literature (Houshyar et al., 2020 ; Jeffery et al., 2020 ), by proposing an\nefficient data-driven model for predicting the daily utilization of these services during thispandemic, once again, this study differs from the existing studies. As an interpretation, itshould be noted that the decrease in the utilization of EDs’ resources accelerates the accessto resources and enables more efficient use of them, and solves another challenge of longwaiting times in EDs.\nA critical step in devising the proposed model was determining the model inputs appro-\npriately. In the case study implementation, input variables are defined in two categories as(i) variables representing system dynamics and (ii) government restrictions and prohibitions.While policy-based variables are defined commonly in implementing the proposed modelfor considered ED operations, system dynamics-based variables are explicitly defined foreach operation. The primary demographics, such as gender, age, triage level, arrival type,and ICD-10 encoded diagnosis in the ED patients’ database, were used and appropriatelytransformed to identify operation-specific input variables. 
The values of these variables weremeasured based on the data set for the before-COVID-19 study period.\nAfter forming data sets in this manner, the proposed model was tested for the considered\nED operations of managing the daily number of patients, average daily LOS, daily numbersof laboratory tests ordered, and daily numbers of radiologic imaging tests ordered. Whenthe relations between the specified input variables and the daily number of patients during-COVID-19 period were analyzed, it was concluded that policy-based attributes have more\n123\n\n[Página 21]\nAnnals of Operations Research (2023) 328:1073–1103 1093\nsignificant effects on the daily number of patients compared to the identified system dynamics-\nrelated input variables. Some relations were observed between the defined input variables,such as transport bans and restrictions on the elderly, and the daily average LOS during-COVID-19. While policy-based variables, such as total curfew, are related to the daily numberof laboratory tests ordered during-COVID-19 period, some other system dynamics-relatedinput variables also have relations with the corresponding output variable. Finally, bothpolicy-based attributes, namely, curfews and restrictions and transport bans, and most systemdynamics-related variables seemed to relate to the daily number of radiologic imaging testsordered. It is also noted that the depicted correlations between policy-based input variablesand the corresponding output variables had negative signs showing that such policies maydecrease patient volume and the utilization of primary ED resources. From these findings, itis concluded that the restrictions and prohibitions imposed by the government in coping withCOVID-19 have had significant impacts on the management of ED operations. This resultis in line with the existing studies (Akter & Wamba, 2019 ; Haldane & Morgan, 2021 ; Sözen\net al., 2022 ). 
Our findings contribute to the literature by investigating the effects of system\ndynamics-related and government-imposed actions together and comparatively for differentoperations of EDs.\nThe obtained data sets were then used to implement the proposed model in the four primary\nED operations using MLP neural networks. Neural network algorithms have been presentedin the literature for automatic COVID-19 detection (Qayyum et al., 2021 ) and infection rate\npredictions (Wieczorek et al., 2020 ; Sozen, Sariyer & Ataman, 2021). By implementing this\nalgorithm in multi real-life operations of EDs, the used contexts of this BDA technique havebeen extended in this paper. The model has high prediction accuracies for managing dailypatient numbers and daily use of resources during a pandemic. Besides achieving or exceedingthe prediction performances of models in the literature in this context (Whitt & Zhang, 2019 ),\nthese results achieved the targeted value (85%) set by this ED’s service providers. Althoughthe model’s performance is lower in predicting daily average LOS values, it can still matchthe performance of previous studies (Ataman & Sariyer, 2021) and achieve the targeted valueof 75% accuracy. This operation’s targeted value is smaller than others since modeling LOS ismore complex. Thus, with the proposed model, which utilizes BDA, we believe that even themost challenging health services operations may be managed efficiently, and the difficultiesposed by emergencies can be handled.\n7 Implications\n7.1 Theoretical implications\nThe study underpins the dynamic capability theory in two folds. The emergencies are featuredwith the rapidly changing conditions and parameters. Hence, the data inherent in the crisesexhibits a dynamic feature. Eventually, the properties of the data set are subject to change.Therefore, DC theory arises as an ideal theoretical structure to embrace dynamically changingenvironments caused by emergencies. 
While such situations cause rapid changes in patientvolumes, varieties, and characteristics, from different viewpoints, the government’s policies,such as restrictions and prohibitions in fighting these situations, create additional modifi-cations in the system environment. For instance, during emergencies caused by pandemicillnesses, volumes of infected patients may significantly increase. The total patient volumein health services may also be decreased due to panic and stress factors created by being\n123\n\n[Página 22]\n1094 Annals of Operations Research (2023) 328:1073–1103\ninfected and based on governmental policies such as stay-home warnings and curfews. All of\nthis support how emergencies create dynamically changing environments. This implicationis strengthened by comparing the main features of the health system data before-COVID19and during-COVID19 periods. Hence, the study’s findings state that DC is applicable inemergencies.\nThe second fold of the theoretical implication can be asserted that dynamically changing\nenvironments caused by emergencies affect decision-making processes. As the propertiesof the data set act in a dynamic manner, it forces the decision-making process to be in linewith this rapid change. Even though the big data nature of the data sets stays the same,the time pressure on the decision-makers is higher due to the fast and dynamic change ofdata. Thus, the need for rapid decision-making increases the need for the capabilities relatedto data analytics. Therefore, BDAC is a crucial structure for building the decision-makingmechanism within emergencies. Once again, the study’s findings support this implication byhighlighting the significant changes in patient volumes, demographics (such as distributionson gender, age, triage, arrival type, and diagnosis categories), and diagnostic test requirements(resource usage) between the before and during pandemic periods. 
Being aware of changes insuch parameters and having capabilities of shaping ED services rapidly in response to thesechanges provide significant advantages in fighting emergencies. Thus, it can be depicted thatBDAC is applicable in emergencies.\nThus, although dynamic capability theory and the recent view of BDAC have been well\npresented in management literature, this study attempts to extend their usage in the healthcontext, particularly under emergencies. By discussing the rapidly changing parameters andfeatures of the health system environments in emergencies, proposing a model highlightinga need for BDAC, and implementing this model in a real-life big data study, this study aimsto contribute to the context of these theories.\n7.2 Managerial implications\nOur main suggestion is that the decision-makers of health services have BDAC and use bigdata sets of their system environments effectively to create meaningful knowledge, whichshould then be turned rapidly into actions. Adopting the system to dynamically changing con-ditions caused by emergencies quickly and efficiently should be achieved by taking advantageof the emerging technologies and by being able to implement these technologies in practicefor planning and managing operations. Based on the results of this study, we showed howthe current emergency, COVID-19, and the government policies change the patient volumes,varieties, and characteristics. Since such changes may significantly affect ED operations, andbecause it is essential to provide rapid responses to these changing situations, it should alsobe noted that understanding and identifying the main factors that impact their operations iscritical. Suppose system-related factors are characterized and appropriately measured, andexternal factors that may arise from the emergencies are carefully followed and identified.All these factors can be collectively used in modeling ED operations by taking advantageof BDA technologies. 
Hence, the system may function efficiently even in emergencies. Thechallenges arising in the ED environment and posed by emergencies can be easily managed insuch conditions. Based on such models, the managers will be able to make rapid and correctdecisions and adapt the system efficiently to dynamically changing conditions.\nWe also highlight the importance of data recording in health services. Although BDA and\nBDAC are significant technologies and capabilities for health services and particularly emer-gency departments, all these do not make any sense if there exist no data sets to analyze, create\n123\n\n[Página 23]\nAnnals of Operations Research (2023) 328:1073–1103 1095\nknowledge, and use in decision making. Therefore, we suggest that the ED decision-makers\nfocus on electronic recording and data storage processes and should not avoid investing inthese processes and systems. Since the quantity and quality of the data allow meaningful andactionable knowledge, the decision-makers should spend time and effort testing the quality ofrecording processes. Assuring the existence of valid and reliable big data sets is the primaryprior condition for an ED decision-maker to take advantage of BDA in fighting against thechallenges and uncertainties posed by emergencies. This is also very important for satisfyingthe sustainable monitoring in ED processes and real-time emergency response applications.\n7.3 Policy implications\nThis study mainly emphasized the overcrowded ED environments and the significance of thisproblem in our ED, even regularly. Based on the findings, we noted that this overcrowdingmight be primarily associated with the redundant use of these services, particularly for patientswho occupy them for non-urgent situations. These types of patients generally perceive EDsas gateways to hospitals. 
To not make an appointment and wait in line for polyclinic servicesor receive a health service at weekends or nights, as EDs provide a 7/24 service, patientsmay choose to visit EDs. However, providing a timely and efficient service becomes morechallenging in these crowded environments based on limited resources. If ED operationscannot be appropriately managed, patients even in emergent and urgent situations may have towait to be treated, which may have significant consequences. To cope with this overcrowdingproblem, different government actions should be taken.\nThis study also analyzes the effects of government restrictions and prohibitions in coping\nwith emergencies, particularly COVID-19. It should be highlighted that imposing these poli-cies is crucial in emergencies to protect the functioning of EDs. Government policies, suchas curfews (lock-downs), transport bans, and partial restrictions on the elderly or the young,may decrease patient volumes, redundant ED visits, and resource utilization.\nIn today’s era that requires awareness of big data and the related contexts of BDA and\nBDAC, we also advise policymakers to invest in data storage and analysis in governmentagencies. Governments must create awareness of these emerging concepts and technologiesin public institutions. Governments should pay time, effort, and budget to regularly controlthe agencies based on their data storage capabilities, qualities, quantities, and reliabilities.It may be necessary to impose sanctions on institutions deficient in these concepts duringthese controls. Creating high-quality, reliable, and robust data sets in government institutionswill improve more accurate and timely decision-making processes in emergency and routinesituations. 
This may also help governments integrate sustainability orientation in health careoperations and flexibility for managing emergencies.\n8 Conclusion\nWhile emergencies precisely demonstrate dynamically changing environments, health ser-vices are the main actors in coping with those situations. Governments are another leadingactor; they are the enablers of the system and may impose restrictions and prohibitions toprotect the functioning of health services. We, therefore, propose a model, which is groundedin the dynamic capabilities and related context of BDAC, for managing operations of one ofthe most crucial health services units, namely, EDs, during emergencies. With this model,we aim not only to manage ED operations sustainably but also to investigate the effects\n123\n\n[Página 24]\n1096 Annals of Operations Research (2023) 328:1073–1103\nof imposed restrictions and prohibitions on these operations. Besides proposing a generic\nmachine learning integrated model for managing ED operations under emergencies and vali-dating this model for different operations of EDs, taking the governmental actions as the mainfactors of this model and thus showing how they affect these operations is the main contri-bution of this paper. This study also contributes to dynamic capability theory and BDAC byextending their usage for the decision-making processes of one of the most important actorsof health services, EDs, under emergencies. We also believe that the proposed BDA-drivenmodel or more general big data and BDA implementations in real-life operations may helpsatisfy sustainable operations in EDs.\nThe proposed model adopts one of the most popular BDA techniques: multilayer per-\nceptron neural networks. The model is implemented in a real-life data set representing alarge-scale ED with daily patient volumes of more than 1,000. The current COVID-19 pan-demic represents a focused emergency. 
The model is validated in four different primaryoperations of EDs: managing daily numbers of patients, daily average stays of patients anddaily usage of resources (laboratory services and radiologic imaging services). The predic-tion performance of the proposed model varies between 80 to 95% for the correspondingoperations. This study also showed that policy-based factors might significantly affect EDoperations. Such restrictions and prohibitions may cause sharp decreases in patient volumesand resource utilisations in EDs, which are challenged by overcrowding. Thus, imposingsuch policies is crucial to protect ED functioning in emergencies.\nThe main limitation of this study was that its experimental evaluation was based on data\ncollected from a single case study, and its findings may, therefore, not generalize to emer-gency departments with significantly different patient populations, characteristics, volumes,and varieties. Generalizing these results to other emergency departments with different oper-ational processes, guidelines, and dynamics may also be impossible. Operationally, to ensurerobustness, it is critical to check for variations in patient and system dynamics patternsobserved in this case study to transfer the proposed model to other emergency departments.Future studies should include a broader set of operations, measurements, internal and exter-nal variables, and outcomes from multiple emergency departments to support the robustnessof the proposed model. Finally, we expect that the implementation of deep learning tech-niques can potentially further improve the predictive performance of the proposed model forconsidered operations of EDs.\nAppendix 1\nCorrelation matrices of the identified variables of the models for corresponding ED opera-tions.\nSee Fig. 5.\n123\n\n[Página 25]\nAnnals of Operations Research (2023) 328:1073–1103 1097\nFig. 
5 Operation 1: Modelling daily numbers of ED patients during COVID-19\n123\n\n[Página 26]\n1098 Annals of Operations Research (2023) 328:1073–1103\nAppendix 2\nSee Fig. 6.\nFig. 6 Operation 2: Modelling daily average LOS of ED patients during COVID-19\n123\n\n[Página 27]\nAnnals of Operations Research (2023) 328:1073–1103 1099\nAppendix 3\nSee Fig. 7.\nFig. 7 Operation 3: Modelling daily numbers of laboratory tests ordered\n123\n\n[Página 28]\n1100 Annals of Operations Research (2023) 328:1073–1103\nAppendix 4\nSee Fig. 8.\nFig. 8 Operation 4: Modelling daily numbers of radiologic imaging tests ordered\nReferences\nAbdel-Basset, M., Chang, V ., & Nabeeh, N. A. (2021). An intelligent framework using disruptive technologies\nfor COVID-19 analysis. T echnological F orecasting and Social Change, 163 , 120431.\nAkter, S., & Wamba, S. F. (2019). Big data and disaster management: A systematic review and agenda for\nfuture research. Annals of Operations Research, 283 (1), 939–959.\nAlinaghian, M., & Goli, A. (2017). Location, allocation and routing of temporary health centers in rural\nareas in crisis, solved by improved harmony search algorithm. International Journal of Computational\nIntelligence Systems, 10 (1), 894–913.\nAtaman, M. G., & Sarıyer, G. (2021). Predicting waiting and treatment times in emergency departments using\nordinal logistic regression models. The American Journal of Emergency Medicine, 46 , 45–50.\nBag, S., Gupta, S., Choi, T. M., & Kumar, A. (2021). Roles of innovation leadership on using big data analytics\nto establish resilient healthcare supply chains to combat the COVID-19 pandemic: A multimethodologicalstudy. IEEE Transactions on Engineering Management .https://doi.org/10.1109/TEM.2021.3101590\n123\n\n[Página 29]\nAnnals of Operations Research (2023) 328:1073–1103 1101\nBirkinshaw, J., Zimmermann, A., & Raisch, S. (2016). How do firms adapt to discontinuous change? Bridging\nthe dynamic capabilities and ambidexterity perspectives. 
California Management Review, 58 (4), 36–58.\nChoi, T. M. (2021). Fighting against COVID-19: What operations research can help and the sense-and-respond\nframework. Annals of Operations Research .https://doi.org/10.1007/s10479-021-03973-w\nChoi, T. M., Wallace, S. W., & Wang, Y . (2018). Big data analytics in operations management. Production\nand Operations Management, 27 (10), 1868–1883.\nDas, S. K., Pervin, M., Roy, S. K., & Weber, G. W. (2021). Multi-objective solid transportation-location problem\nwith variable carbon emission in inventory management: A hybrid approach. Annals of Operations\nResearch .https://doi.org/10.1007/s10479-020-03809-z\nDeloitte. (2020). COVID -19: Managing supply chain risk and disruption . Retrieved November 10,\n2020, from https://www2.deloitte.com/global/en/pages/risk/articles/covid-19-managing-supply-chain-\nrisk-anddisruption.html .\nDonthu, N., & Gustafsson, A. (2020). Effects of COVID-19 on business and research. Journal of Business\nResearch, 117 , 284.\nFeng, Q., & Shanthikumar, J. G. (2018). How research in production and operations management may evolve\nin the era of big data. Production and Operations Management, 27 (9), 1670–1684.\nFortune. (2020). 94% of the F ortune 1000 are seeing coronavirus supply chain disruptions: Report .\nRetrieved November 10, 2020, from https://fortune.com/2020/02/21/fortune-1000-coronavirus-china-\nsupply-chain-impact/ .\nGoli, A., Zare, H. K., Tavakkoli-Moghaddam, R., & Sadeghieh, A. (2019). Hybrid artificial intelligence and\nrobust optimization for a multi-objective product portfolio problem Case study: The dairy productsindustry. Computers and Industrial Engineering, 137 , 106090.\nGoli, A., Khademi-Zare, H., Tavakkoli-Moghaddam, R., Sadeghieh, A., Sasanian, M., & Malekalipour\nKordestanizadeh, R. (2021). An integrated approach based on artificial intelligence and novel meta-\nheuristic algorithms to predict demand for dairy products: a case study. 
Network Computation in Neural\nSystems, 32 (1), 1–35.\nGuo, M., Zhang, Q., Liao, X., Chen, F. Y ., & Zeng, D. D. (2020). A hybrid machine learning framework for\nanalyzing human decision-making through learning preferences. Omega, 101 , 102263.\nGupta, S., Justy, T., Kamboj, S., Kumar, A., & Kristoffersen, E. (2021). Big data and firm marketing per-\nformance: Findings from knowledge-based view. T echnological F orecasting and Social Change, 171 ,\n120986.\nHaldane, V ., & Morgan, G. T. (2021). From resilient to transilient health systems: The deep transformation of\nhealth systems in response to the COVID-19 pandemic. Health Policy and Planning, 36 (1), 134–135.\nHarvard Business Review. (2020). Coronavirus is proving we need more resilient supply chains . Retrieved\nNovember 5, 2020, from https://hbr.org/2020/03/coronavirus-is-proving-that-we-need-moreresilient-\nsupply-chains .\nHossain, M. K., Thakur, V ., & Mangla, S. K. (2021). Modeling the emergency healthcare supply chains:\nResponding to the COVID-19 pandemic. Journal of Business and Industrial Marketing .https://doi.org/\n10.1108/JBIM-07-2020-0315\nHoushyar, R., Tran-Harding, K., Glavis-Bloom, J., Nguyentat, M., Mongan, J., Chahine, C., Loehfelm, T.\nW., Kohli, M. D., Zaragoza, E. J., Murphy, P. M., & Kampalath, R. (2020). Effect of shelter-in-place\non emergency department radiology volumes during the COVID-19 pandemic. Emergency radiology,\n27(6), 781–784.\nHuang, H., Peng, Z., Wu, H., & Xie, Q. (2020). A big data analysis on the five dimensions of emergency\nmanagement information in the early stage of COVID-19 in China. Journal of Chinese Governance,\n5(2), 213–233.\nJeffery, M. M., D’onofrio, G., Paek, H., Platts-Mills, T. F., Soares, W. E., Hoppe, J. A., Genes, N., Nath, B.,\n& Melnick, E. R. (2020). Trends in emergency department visits and hospital admissions in health caresystems in 5 states in the first months of the COVID-19 pandemic in the US. 
JAMA internal medicine,\n180(10), 1328–1333.\nKapoor, K., Bigdeli, A. Z., Dwivedi, Y . K., & Raman, R. (2021). How is COVID-19 altering the manufac-\nturing landscape? A literature review of imminent challenges and management interventions. Annals of\nOperations Research .\nhttps://doi.org/10.1007/s10479-021-04397-2\nKendzerska, T., Zhu, D. T., Gershon, A. S., Edwards, J. D., Peixoto, C., Robillard, R., & Kendall, C. E. (2021).\nThe effects of the health system response to the COVID-19 pandemic on chronic disease management:A narrative review. Risk Management and Healthcare Policy, 14 , 575.\nKumar, A., Shankar, R., Choudhary, A., & Thakur, L. S. (2016). A big data MapReduce framework for\nfault diagnosis in cloud-based manufacturing. International Journal of Production Research, 54 (23),\n7060–7073.\n123\n\n[Página 30]\n1102 Annals of Operations Research (2023) 328:1073–1103\nKumar, A., Shankar, R., & Aljohani, N. R. (2020). A big data driven framework for demand-driven forecasting\nwith effects of marketing-mix variables. Industrial Marketing Management, 90 , 493–507.\nLee, S. M., & Trimi, S. (2021). Convergence innovation in the digital age and in the COVID-19 pandemic\ncrisis. Journal of Business Research, 123 , 14–22.\nMari´ c, J., Galera-Zarco, C., & Opazo-Basáez, M. (2021). The emergent role of digital technologies in the\ncontext of humanitarian supply chains: A systematic literature review. Annals of Operations Research .\nhttps://doi.org/10.1007/s10479-021-04079-z\nMidya, S., Roy, S. K., & Yu, V . F. (2021). Intuitionistic fuzzy multi-stage multi-objective fixed-charge solid\ntransportation problem in a green supply chain. International Journal of Machine Learning and Cyber-\nnetics, 12 (3), 699–717.\nMishra, D., Gunasekaran, A., Papadopoulos, T., & Childe, S. J. (2018). Big Data and supply chain management:\nA review and bibliometric analysis. Annals of Operations Research, 270 (1), 313–336.\nMondal, A., & Roy, S. K. (2021). 
Multi-objective sustainable opened-and closed-loop supply chain under\nmixed uncertainty during COVID-19 pandemic situation. Computers & Industrial Engineering, 159 ,\n107453.\nMondal, A., & Roy, S. K. (2022). Application of Choquet integral in interval type-2 Pythagorean fuzzy\nsustainable supply chain management under risk. International Journal of Intelligent Systems, 37 (1),\n217–263.\nPapadopoulos, T., Baltas, K. N., & Balta, M. E. (2020). The use of digital technologies by small and medium\nenterprises during COVID-19: Implications for theory and practice. International Journal of Information\nManagement, 55 , 102192.\nSarkis, J. (2021). Supply chain sustainability: Learning from the COVID-19 pandemic. International Journal\nof Operations & Production Management, 41 (1), 63–73.\nSchreyer, K. E., Daniel, A., King, L. L., Blome, A., DeAngelis, M., Stauffer, K., Desrochers, K., Donahue, W.,\nPolitarhos, N., Raab, C., & McNamara, R. (2020). Emergency department management of the Covid-19\npandemic. The Journal of emergency medicine, 59 (6), 946–951.\nThakur, V ., Mangla, S. K., & Tiwari, B. (2021). Managing healthcare waste for sustainable environmental\ndevelopment: A hybrid decision approach. Business Strategy and the Environment, 30 (1), 357–373.\nTirkolaee, E. B., Goli, A., Ghasemi, P., & Goodarzian, F. (2022). Designing a sustainable closed-loop supply\nchain network of face masks during the COVID-19 pandemic: Pareto-based algorithms. Journal of\nCleaner Production, 333 , 130056.\nQayyum, A., Razzak, I., Tanveer, M., & Kumar, A. (2021). Depth-wise dense neural network for automatic\nCOVID19 infection detection and diagnosis. Annals of Operations Research .https://doi.org/10.1007/\ns10479-021-04154-5\nQueiroz, M. M., Ivanov, D., Dolgui, A., & Wamba, S. F. (2020). 
Impacts of epidemic outbreaks on supply\nchains: Mapping a research agenda amid the COVID-19 pandemic through a structured literature review.Annals of Operations Research .https://doi.org/10.1007/s10479-020-03685-7\nSanders, N. R., & Ganeshan, R. (2018). Big data in supply chain management. Production and Operations\nManagement, 27 (10), 1745–1748.\nSarıyer, G., & Ataman, M. G. (2020). The likelihood of requiring a diagnostic test: Classifying emergency\ndepartment patients with logistic regression. Health Information Management Journal, 51 (1), 13–22.\nSarıyer, G., Ataman, M. G., & Kızılo˘ glu, ˙I. (2020). Analyzing main and interaction effects of length of stay\ndeterminants in emergency departments. International Journal of Health Policy and Management, 9 (5),\n198–205.\nSözen, M. E., Sarıyer, G., & Ataman, M. G. (2022). Big data analytics and COVID-19: Investigating the\nrelationship between government policies and cases in Poland, Turkey, and South Korea. Health Policy\nand Planning, 37 (1), 100–111.\nSharma, M., Luthra, S., Joshi, S., & Kumar, A. (2020). Developing a framework for enhancing survivability\nof sustainable supply chains during and post-COVID-19 pandemic. International Journal of Logistics\nResearch and Applications, 25 (4–5), 433–453.\nRubbio, I., Bruccoleri, M., Pietrosi, A., & Ragonese, B. (2020). Digital health technology enhances resilient\nbehaviour: Evidence from the ward. International Journal of Operations and Production Management,\n40(1), 34–67.\nTeece, D. J., Pisano, G., & Shuen, A. (1997). Dynamic capabilities and strategic management. Strategic\nManagement Journal, 18\n(7), 509–533.\nTeece, D., Peteraf, M., & Leih, S. (2016). Dynamic capabilities and organizational agility: Risk, uncertainty,\nand strategy in the innovation economy. California Management Review, 58 (4), 13–35.\nVerma, S., & Gustafsson, A. (2020). 
Investigating the emerging COVID-19 research trends in the field of busi-\nness and management: A bibliometric analysis approach. Journal of Business Research, 118 , 253–261.\n123\n\n[Página 31]\nAnnals of Operations Research (2023) 328:1073–1103 1103\nWamba, S. F., Gunasekaran, A., Akter, S., Ren, S. J. F., Dubey, R., & Childe, S. J. (2017). Big data analytics\nand firm performance: Effects of dynamic capabilities. Journal of Business Research, 70 , 356–365.\nWhitt, W., & Zhang, X. (2019). Forecasting arrivals and occupancy levels in an emergency department.\nOperations Research for Health Care, 21 , 1–18.\nWieczorek, M., Siłka, J., & Wo´ zniak, M. (2020). Neural network powered COVID-19 spread forecasting\nmodel. Chaos, Solitons & Fractals, 140 , 110203.\nYu, W., Zhao, G., Liu, Q., & Song, Y . (2021). Role of big data analytics capability in developing integrated\nhospital supply chains and operational flexibility: An organizational information processing theory per-\nspective. T echnological F orecasting and Social Change, 163 , 120417.\nZollo, M., & Winter, S. G. (2002). Deliberate learning and the evolution of dynamic capabilities. Organization\nScience, 13 (3), 339–351.\nPublisher’s Note Springer Nature remains neutral with regard to jurisdictional claims in published maps and\ninstitutional affiliations.\nSpringer Nature or its licensor holds exclusive rights to this article under a publishing agreement with the\nauthor(s) or other rightsholder(s); author self-archiving of the accepted manuscript version of this article issolely governed by the terms of such publishing agreement and applicable law.\n123",
+ "2bca5cca-f44c-4503-bbd0-551892538300": {
+ "content": "Annals of Operations Research (2023) 328:1073–1103\nhttps://doi.org/10.1007/s10479-022-04955-2\nORIGINAL RESEARCH\nBig data analytics and the effects of government restrictions\nand prohibitions in the COVID-19 pandemic on emergency\ndepartment sustainable operations\nGörkem Sariyer1·Mustafa Gokalp Ataman2·Sachin Kumar Mangla3·\nYigit Kazancoglu4·Manoj Dora5\nAccepted: 29 August 2022 / Published online: 15 September 2022\n© The Author(s), under exclusive licence to Springer Science+Business Media, LLC, part of Springer Nature 2022\nAbstract\nGrounded in dynamic capabilities, this study mainly aims to model emergency departments’(EDs) sustainable operations in the current situation caused by the COVID-19 pandemic byusing emerging big data analytics (BDA) technologies. Since government may impose somerestrictions and prohibitions in coping with emergencies to protect the functioning of EDs,it also aims to investigate how such policies affect ED operations. The proposed model isdesigned by collecting big data from multiple sources and implementing BDA to transformit into action for providing efficient responses to emergencies. The model is validated inmodeling the daily number of patients, the average daily length of stay (LOS), and dailynumbers of laboratory tests and radiologic imaging tests ordered. It is applied in a case studyrepresenting a large-scale ED. The data set covers a seven-month period which collectivelymeans the periods before COVID-19 and during COVID-19, and includes data from 238,152patients. Comparing statistics on daily patient volumes, average LOS, and resource usage,both before and during the COVID-19 pandemic, we found that patient characteristics anddemographics changed in COVID-19. While 18.92% and 27.22% of the patients requiredlaboratory and radiologic imaging tests before-COVID-19 study period, these percentageswere increased to 31.52% and 39.46% during-COVID-19 study period. 
By analyzing theeffects of policy-based variables in the model, we concluded that policies might cause sharpdecreases in patient volumes. While the total number of patients arriving before-COVID-19was 158,347, it decreased to 79,805 during-COVID-19. On the other hand, while the averagedaily LOS was 117.53 min before-COVID-19, this value was calculated to be 165,03 min\nB Yigit Kazancoglu\nyigit.kazancoglu@yasar.edu.tr\n1Yasar University, Department of Business Administration, ˙Izmir, Turkey\n2Bakırçay University Çi˘ gli Region Training and Research Hospital, Department of Emergency\nMedicine, ˙Izmir, Turkey\n3Digital Circular Economy for Sustainbale Development Goals (DCE-SDG), Jindal Global Business\nSchool, O P Jindal Global University, Haryana, India\n4Yasar University, Department of Logistics Management, ˙Izmir, Turkey\n5Sustainable Production and Consumption School of Management Anglia Ruskin University, Cambridge,\nUK\n123\n1074 Annals of Operations Research (2023) 328:1073–1103\nduring-COVID-19 study period. We finally showed that the model had a prediction accuracy\nof between 80 to 95%. While proposing an efficient model for sustainable operations manage-ment in EDs for dynamically changing environments caused by emergencies, it empiricallyinvestigates the impact of different policies on ED operations.\nKeywords Big data analytics ·Emergency department ·COVID-19 ·Machine learning ·\nSustainable operations\n1 Introduction\nMedical scientists and sociologists have widely researched the effects of the COVID-19pandemic on human physical and psychological health. 
Its impacts on operations and supplychain management have gained significant attention from scholars (Choi, 2021 ; Queiroz et al.,\n2020 ;S a r k i s , 2021 ) and industry experts (Deloitte, 2020 ; Harvard Business Review, 2020 ).\nHowever, although the COVID-19 pandemic has affected operations and supply chains ona large scale and most the companies have faced disruptions (Fortune, 2020 ) since it has\nalso created emergency situations in many countries, its impact on health services is a highpriority and needs to be addressed.\nEfficient and timely service delivery is a significant burden for health services, and the\nimportance of providing rapid responses increases in emergencies. However, as experiencedduring the COVID-19 pandemic, this is very challenging, particularly for EDs, which areincreasingly used as gateways to hospital admissions and have been identified as one ofthe most overcrowded health services units. Besides, since most countries provide a 7/24ED service, non-urgent patients frequently occupy them, which has also been identified asan essential issue leading to increased overcrowding (Ataman & Sariyer, 2021 ). While the\nproblem of overcrowding in EDs is a major challenge for the service providers even in regulartimes (Sariyer & Ataman, 2020 ), pandemic environments push these services into bottlenecks\nsince the number of patients being infected increases uncontrollably. In addition to this sharpincrease in patient volumes, the profiles and demographics of patient admissions to hospitalEDs also vary significantly. Under these circumstances, to protect the functioning of healthservices and EDs, governments are forced to impose widespread restrictions and prohibitions.To cope with the COVID-19 pandemic, the leaders of many countries declared sudden orphased lockdowns and quarantines and the closure of physical shops and businesses, transportbans, etc. 
Although these may help the functioning of EDs under emergencies and cause asudden decrease in patient volumes, it is crucial for ED service providers to rapidly adapt thesystem in response to such changes and be able to manage operations efficiently in highlydynamic conditions (Alinaghian & Goli, 2017 ; Hossain et al., 2021 ; Mondal & Roy, 2021 ;\nThakur et al., 2021 ). Thus, not only but especially under emergencies, EDs must have strong\ndynamic capabilities to manage these uncertain and dynamically changing environments.\nThese huge patient volumes and the extensive range of patient characteristics also create\nlarge volumes of data for EDs. Thus, these health services are additionally challenged bya ubiquitous context of big data, which has appeared as an exciting frontier of productivityand opportunity (Sanders & Ganeshan, 2018 ). In this era, data is also identified as a valuable\nasset of EDs, enabling insights and decision making (Feng & Shanthikumar, 2018 ). However,\nbig data requires the ability to process and arrange it to be used in decision-making. Thus,although the collected data is precious for EDs, unless they can analyze it and transform itinto useful information that can be turned into rapid action, it cannot go beyond useless data\n123\nAnnals of Operations Research (2023) 328:1073–1103 1075\nrecording that simply takes up storage capacity. At this point, BDA becomes increasingly\ncrucial for EDs in making efficient and timely decisions in emergency situations.\nThe term ’BDA’ is used to refer to the techniques, technologies, systems, practices,\nmethodologies, and applications for analyzing big data sets and is defined as a holistic processof collecting, managing, and investigating the five major dimensions of data: volume, variety,velocity, veracity, and value (Wamba et al., 2017 ). BDA can support operational and strategic\ndecision-making and turn to action in value creation for all organizational levels and enhanceoperational performance. 
BDA technologies have been implemented for various operationsand supply chain practices based on their superior performances (Gupta et al., 2021 ;K u m a r\net al., 2016 ,2020 ; Mari´ ce ta l . , 2021 ;M i s h r ae ta l . , 2018 ). In the big data era, BDA can be\nviewed as an organizational capability for EDs to cope with dynamically changing situa-tions. Thus, besides having strong dynamic capabilities, if an ED holds BDA capabilities tomanage big data, it should respond more actively to emergencies, increasing its efficiencyand performance in managing operations. Moreover, big data and BDA implementations inreal-time systems will have great importance in providing sustainable ED operations (Daset al., 2021 ;G o l ie ta l . , 2019 ,2021 ; Midya et al., 2021 ; Mondal & Roy, 2022 ). Having such\ncapabilities and advantages, BDA has attracted researchers, decision, and policymakers incoping with COVID-19 as a current global emergency (Abdel-Basset et al., 2021 ; Bag et al.,\n2021 ; Huang et al., 2020 ; Kapoor et al., 2021 ; Lee & Trimi, 2021 ; Mondal & Roy, 2021 ;\nPapadopoulos et al., 2020 ; Sharma et al., 2020 ; Sözen et al., 2022 ; Tirkolaee et al., 2022 ).\nAlthough these technologies are popular in the COVID-19 context, they have little use in\nthe ED operations decision-making processes in this pandemic period. On the other hand,since EDs are the main actors of health services in managing emergency environments,taking advantage of these technologies to improve EDs’ operations is critical in effectivelymanaging emergencies. Besides, since governmental reactions in fighting COVID-19 havecaused sharp and significant changes in the demand for EDs, investigating the effects of theseactions in EDs operations and putting these effects into account in decision-making modelsis another unique point. Therefore, this study aims to present a model implementing BDAtechnologies for managing four primary ED operations in COVID-19. 
By conducting inter-views with ED service providers and searching the related literature, the primary operationsthat are challenging for ED services in emergencies and even in regular times are deter-mined as managing daily patient volumes, average stay lengths of patients, and utilizationof laboratory radiologic imaging services. Besides proposing a generic model for managingED operations under emergencies and validating this model for different processes of EDs,taking the governmental actions as the main factors of this model and thus showing how theyaffect these operations is the novelty of this paper. Hence, we aim to answer the followingresearch questions in this paper:\nRQ1. How does BDA assist in making effective decisions for predicting daily patient\nvolumes, average stay lengths of patients, and resource utilization of EDs under dynam-ically changing conditions caused by emergencies?RQ2. How do government-imposed restrictions and prohibitions affect daily patientvolumes, average stay lengths, and ED resource utilization of EDs in emergencies?\nSince the current emergency having worldwide effects is the COVID-19 pandemic, we\nfocus on modeling ED operations during COVID-19 and identify the restrictions and prohi-bitions imposed to cope with this pandemic. To address these research questions, we proposea BDA-driven model and implement machine learning techniques as one of the most potentsub-set of BDA. More specifically, we implement neural networks-based techniques and mul-tilayer perceptron (MLP) algorithms to develop required predictions on daily patient volumes,\n123\n1076 Annals of Operations Research (2023) 328:1073–1103\naverage stay lengths, and daily utilization of laboratory and imaging services of EDs. In vali-\ndating this model in different ED operations, we define the output variables for each operationas previously stated and identify two sets of factors (input variables). 
While in the first set, weidentify possible operation-specific factors that may affect the output variable of this oper-ation. We define additional elements representing different types of government restrictionsand prohibitions in the second set. These factors are similarly used for each operation. Withthe proposed model and implemented MLP algorithm by obtaining 80% to 95% accuraciesfor predicting the output values of four ED operations, we answered the RQ1 of this studysince such accurate predictions play a crucial role in making efficient decisions EDs underemergencies. By investigating the significance of the relations between the output variablesand the set of input factors representing the government-imposed restrictions and prohibitionsand analyzing the directions of these relations, we answered the RQ2 of this study.\nThe organization of this paper is as follows. In Sect. 2, we discuss the theoretical back-\nground of this paper. We present the proposed model in Sect. 3and introduce the case study,\nand data set characteristics, data pre-processing steps, and results of the proposed model inSect. 4. Section 5discusses the findings of this study. We present the theoretical, managerial,\nand policy implications in Sect. 6. Section 7offers concluding remarks, limitations of this\nstudy, and the future research directions.\n2 Theoretical background\n2.1 The dynamic capabilities view\nDynamic capabilities define an organization’s ability to innovate, adapt to change, andimprove in a good way for its customers (Teece et al., 2016 ). Zollo and Winter ( 2002 ,\np. 340) defined dynamic capability as a \"learned and stable pattern of collective activitythrough which the organization systematically generates and modifies its operating routinesto pursue improved effectiveness.\"\nThe dynamic capabilities utilize an organization’s internal and external resources in the\nbest possible manner to respond appropriately to environmental uncertainties (Teece et al.,1997 ). 
Emergencies cause environmental or external uncertainties, and managing opera-\ntions in EDs, particularly under emergencies, requires real-time information whereby serviceproviders can arrive at critical decisions. The dynamic capabilities help integrate primaryresources through the availability of this information and then further help to modify ED oper-ating routines and procedures appropriately. Therefore, we based our research on the dynamiccapability view. Positioning the resources correctly is the prime requisite for coping with theseuncertainties and the chaotic environments related to emergencies. Dynamic capabilities arethe main processes for sensing, integrating, learning, and reconfiguring resources and capa-bilities (Birkinshaw et al., 2016 ) and stress an organization’s capacity to create, extend or\nmodify its resources purposefully. These are also crucial in managing ED operations, par-ticularly in emergencies, since aligning the capabilities and resources and reconfiguring theprocesses may help dynamically deal with changing patient volumes and profiles. To dealwith unexpected increases in patient volumes in COVID-19, many countries reconfiguredtheir health systems, so pandemic services were opened to provide patients. The resourcesand capacities of these services, such as doctors, nurses, and other health staff, required med-ical equipment (medicines, beds, intensive care units, respiratory devices), were provided bymany different hospital departments and mainly from the EDs. In some countries where pan-demic services were not opened, EDs served as these services and encountered COVID-19\n123\nAnnals of Operations Research (2023) 328:1073–1103 1077\npatients. 
For such countries, the increased need for medical staff and resources was satisfied\nby reconfiguring the hospital’s other services and aligning them with the pandemic services.\nIn the health services operations and supply chain management literature, many stud-\nies base their theoretical backgrounds on the dynamic capability perspective (Rubbio et al.,2020 ). In the era of big data, health systems are one of the primary services that deal with\nbig data sets of the high volume, variety, and velocity of patient data. Thus, we move furthertowards BDA capability (BDAC), which has evolved from the dynamic capability perspec-tive. We, therefore, highlight the importance of having BDAC for managing health servicesoperations, particularly in emergencies.\n2.2 Big data analytics capability\nDuring the COVID-19 pandemic, BDA has been used to detect surface indicators related tothe pandemic (Guo et al., 2020 ). Real-time big data-driven insights have helped scholars and\ndecision-makers to comprehend the impact of this pandemic. COVID-19 trackers provide anessential source of data to help scholars research and make more informed decisions on copingwith this pandemic by collecting and aggregating big data (Verma & Gustafsson, 2020 ). Such\nsituations increase the volume and the variety of patients’ characteristics in health services.Besides, many external factors may come into play, changing the system dynamics. Undersuch circumstances, it is necessary for health services providers to rapidly adapt the systemto the changing conditions to provide timely and effective services to patients. Thus, the roleof BDAC in healthcare operations gained increased attention (Yu et al., 2021 ).\nWe propose a system for managing ED operations, such as forecasting patient volumes,\nanalyzing patient LOS, and modeling the use of primary resources in emergencies. 
Even inregular times, the main challenge faced by ED service providers is the overcrowded environ-ment of these services, which creates vast volumes and varieties of patients. An emergencyis an external challenge that may cause an unexpected and sharp increase in patient volumesand varieties, thus straining the system and making managing operations much more difficult.Government is a prominent actor as a system enabler in this era. To protect the functioningof these services and respond to emergencies, governments impose some policies, such asrestrictions and prohibitions, which may cause a sudden decrease in patient volumes but stillchange the characteristics and increase the system’s randomness. All these create dynami-cally changing environments, and the service providers must adopt the system appropriatelyand effectively in response to these rapidly changing conditions. Since by their nature anddue to all these sudden changes, ED services include a huge volume, variety, velocity, andveracity of data, these services may take advantage of BDA to help operations cope withsuch rapid changes in the system. We summarise the theoretical framework of our researchin Fig. 1.\nAs seen in Fig. 1, based on huge volumes, velocities, and varieties of patients, the\ndata inherent in the EDs exhibits a dynamic feature. Since emergencies are also featuredwith rapidly changing conditions, these increase the randomness in the EDs and, therefore,stalemate decision-making processes in EDs. This study attempts to contribute to dynamiccapability theory and BDAC by extending their usage for the decision-making processes ofone of the most important actors of health services, EDs, under emergencies. By presentingthe rapidly changing features of the EDs in emergencies and presenting a model highlightinga need for BDAC, this study aims to contribute to the context of these theories.\n123\n1078 Annals of Operations Research (2023) 328:1073–1103\nFig. 
1 Theoretical framework of this research\n3 Proposed models\nIn this paper, we propose models for managing the primary operations of EDs, particu-\nlarly in emergencies. These models include five main sequential steps: Data Collection,Pre-processing, Modelling, Testing & Model Evaluation, and Providing Managerial & Pol-icy Implications. As discussed earlier, ED environments contain big data sets that can beprocessed with BDA, and valuable information can be obtained in decision-making. Thus,an essential initial step for adapting these emerging technologies into proposed models andsystems is bringing data sets related to the context. A data set can be obtained using differentsources within this research framework. To get the related data of the proposed models, werequired data triangulation. Valuable data sets for the proposed models are secondary datareceived from a case ED covering the period before and during COVID-19; governmentreports; documentary analysis; and interviews with ED service providers. Case study datamay include relevant information about patients arriving at this ED during the study period.Government reports and documentary analyses should be checked to identify the types ofrestrictions and prohibitions imposed by the government to cope with the emergency. Finally,interviews and documents should be used to decide on the main challenges to ED operations,making planning and managing operations more difficult in emergencies. Related metrics andtargeted values of these metrics can also be identified by collecting data through interviewsand a literature search.\nSince the collected data is raw data, which in its current form is not suitable for analyz-\ning and modeling, different data pre-processing tasks must be performed. It is necessary todefine the input and output variables of the model, define the periodicity (hourly, daily, weekly,monthly, etc.) of the analysis, and determine ways to measure the values of the variables. 
Datatransformation may also involve measuring the values of the variables. One of the main pre-processing tasks in big data studies is cleaning the data set to remove redundant or inappropri-ate data, missing values, and outliers. After all these tasks have been performed, the structureddata set, which can further be processed with BDA tools and techniques, is obtained.\nOnce the structured data set of the model is ready, the modeling step comes next. The\nobtained data set is split into two train and test sets. Train data sets include the values ofall the input and output variables, whereas since the test data set will be used to evaluatethe model’s prediction accuracies, it does not include the values of the output variables. The\n123\nAnnals of Operations Research (2023) 328:1073–1103 1079\ntrain data set is further processed with machine learning as one of the most widely used BDA\ntechniques. Machine learning presents algorithms to extract knowledge and make efficientdecisions by learning from given data sets. Researchers widely prefer these algorithms basedon their flexibility in using data to capture complex and non-linear behaviors (Choi et al.,2018 ). Among various machine learning algorithms, MLP neural networks have received\nsignificant attention since these are appropriate and efficient for function approximation,pattern classification, and prediction. Incorporating hidden layers between input and outputlayers is one of the other parser properties of these algorithms. 
When required by extendingthe number of hidden layers, MLP neural networks can expand the number of input featurecombinations to improve the model’s learning ability, finally increasing the prediction power.Although many other BDA techniques have been widely implemented in the literature, themachine learning-based MLP neural network algorithm is integrated into the proposed modelbased on these properties and superiorities.\nThe testing and model evaluation step comes next in the proposed model. The obtained\nMLP algorithm with the optimized parameters is applied to the test data set to get the predictedvalues of the output variables of interest. The predicted values are then compared with theactual values, and the mean errors and accuracies of the prediction should be calculated. Theseperformances should then be compared with the target values. If the targets are achieved orthe model performance goes beyond the targeted one, the model can be proposed for real-life applications. The results on the significance and impacts of government restrictions andprohibitions may also be discussed in detail, and implications should be recommended topolicymakers. Suppose the model performance cannot achieve the targets. In that case, it isnecessary to go back to the data pre-processing step and re-define the model input and outputvariables. The modeling, testing, and evaluation steps must be repeated until proper modelshave been obtained. The proposed model is shown in Fig. 2.\nFig. 2 Flowchart of the proposed model\n123\n1080 Annals of Operations Research (2023) 328:1073–1103\n4 Case study\n4.1 Case study specification\nWe collected the data set of this study from an ED of a research and training hospital located\nin a metropolitan region in Izmir, Turkey. The daily number of patients or visits to this EDis more than 1,000. This huge patient volume is due to several reasons. First, as mentionedpreviously, overcrowding is a common problem in EDs. 
Second, due to the vast volumes ofnon-urgent patient visits, this problem can be more severe in some countries, such as Turkey,compared to many other countries. Third, many patients may choose to be treated in thishospital due to its type. Fourth, since this is a public hospital, receiving service from EDs isfree of charge. Fifth, since it is located in a metropolitan region and is very close to publictransport stations and the city center, it is also easily accessible for ambulances. Sixth butnot least, since this ED provides uninterrupted service (7 days and 24 h) while many of theother departments of this hospital provide service only within working hours on weekdays,this causes additional visits of patients of different departments to EDs out of the workinghours. These characteristics created huge volumes, velocities, and varieties in the data set.\nIn Turkey, the first COVID-19 case was reported on March 10, 2020, in Istanbul city, and\nthe virus then spread quickly to the whole country. In Turkey, the COVID-19 was encounteredlater than in many other countries. Thus, public awareness had already been created aboutthis virus and the pandemic. Public awareness was a crucial initial step in coping with thisvirus. Since it first appeared in Turkey, the government started announcing policies like\"social distancing,\" \"hygiene,\" and \"stay at home.\" However, raising public awareness fromthe outset and making announcements was not enough to prevent the spread of the virus.Then, the government imposed other types of restrictions and prohibitions. Restrictions forthe elderly, inter-city transport bans and restrictions for the young were imposed startingfrom the end of March. In addition, starting from the middle of April, total curfews wereimposed at weekends (for two days) and for extended weekends in some of the weeks, whichcould last up to three or four days. 
The number of cases and deaths started to fall by May.Then the period of normalization began at the beginning of June. Although restrictions andprohibitions were still in use during this month, they were more relaxed.\nHaving high volumes, velocities, and varieties in patient sizes and characteristics, the\nselected ED was identified as proper for this study’s theoretical framework and methodology.Besides, since in different periods (such as before March and during April) and days (suchas weekdays and weekends), government-imposed actions were highly changing during thestudy period, the case ED allowed to investigate the impact of these actions on ED operations.\n4.2 Data set characteristics\nThe data set covers seven months, from December 2019 to June 2020, and includes 238,152patients. Data from between March 10 to the end of June 2020 represents data collected duringthe period of COVID-19’s first peak in Turkey. To have a similar number of days before theCOVID-19 period, the related data set was started in December 2019. Thus, before COVID-19 and during COVID-19 periods cover around 3.5 months of data. For each arriving patient,records of the ED case include the following information: patient ID, gender, age, arrivaltype, triage level, date of arrival, time of arrival, diagnostic tests for treatment-if required,related times for diagnostic tests, assigned diagnosis type by a doctor after treatment, andtime of departure. The patient ID is unique for each patient arrival. Gender is recorded as\n123\nAnnals of Operations Research (2023) 328:1073–1103 1081\nmale and female. Age is recorded as it is in a continuous form. The arrival type represents\nif a patient arrived by themselves or by ambulance, so it is recorded as one of two options:\"walk-in\" or \"by ambulance.\" When a patient comes to this ED, they are first met by a triagenurse, who triages the patient based on his complaints and clinical acuities. 
This ED uses the3-level Emergency Severity Index for patient triage.\nFurthermore, trauma patients are treated in a different zone. Thus, arriving patients are\nassigned to one of four zones labeled green, yellow, red, and trauma zones. The arrivaldate represents the full date of the patient’s arrival in a day, month, and year form. Time ofarrival shows the exact time of arrival in an hour, minute, and second form. Many diagnostictests can be ordered in EDs for patient diagnosis. The label of the requested test, and therelated ordering time, approval time, and result time are recorded in the next three rows inan hour, minute, and second form. When doctors diagnose the patients, they assign the typeof diagnosis based on the International Classification of Diagnosis 10th version (ICD-10).Thus, the diagnosis cell includes the diagnosis based on the ICD-10 codes, which can have22 different categories. The last cell consists of the departure time of the patient in an hour,minute, and second form.\nThe data set includes additional attributes to represent government restrictions and prohi-\nbitions. The four main restrictions and prohibitions imposed in Izmir city are considered inthe proposed models. During the COVID-19 study period, total curfew (lockdowns), curfewfor the young (age ≤20), curfew for the elderly (age ≥65), and transport bans were imposed.\nThese are also adopted in the proposed models as model input variables, as discussed in thenext section on data pre-processing.\nAs presented in Fig. 2, selecting the study variables is an important initial step of the\nproposed model. However, it should be kept in mind that these variables are not fixed andrigid and may depend on the selected case studies. 
Different variables may define the system’sinternal and external dynamics for other cases.\n4.3 Data pre-processing\nWe implement the proposed model with four different ED operations to investigate how theimposed policies have changed and affected the primary operations and resource usage. Thefirst and second operations, Operation 1 and Operation 2, respectively predict the daily num-ber of patients arriving and the average LOS of these patients (LOS is defined as the timebetween the patient’s arrival and their departure) for each day during-COVID-19 period.Different diagnostic tests can be mainly grouped into either laboratory tests or radiologicimaging tests. Thus, we also implement the model for two other operations to analyze theprimary resource usage. Operation 3 and Operation 4 predict daily numbers of ordered labo-ratory tests and radiologic imaging tests for diagnosing patients. Regarding output variablesor attributes of the model for each operation, these are defined adequately as the daily numberof patients, average daily LOS of patients, the daily number of laboratory tests ordered, andthe daily number of radiologic imaging tests ordered during-COVID-19 period.\nSince the aim is to model and manage related daily values, the data set was initially trans-\nformed. In this process, we eliminated the repetitive values from the data set. More than oneICD-10 encoded diagnosis can be assigned to a patient. Different laboratory tests (hemogram,biochemistry, enzyme, hormone, etc.) or radiologic imaging tests (X-ray, tomography, ultra-sound, magnetic resonance imaging, etc.) can also be ordered for a patient with a unique ID.While obtaining the corresponding daily value of the models, we eliminated these repetitiveor redundant values.\n123\n1082 Annals of Operations Research (2023) 328:1073–1103\nBesides the policy-based attributes, some other input variables were also defined to adopt\nthe system characteristics in the proposed models. 
These variables were used to represent thesystem dynamics in normal circumstances. Previous studies showed that the day of the weekhas a significant effect on patient volume and LOS (Sarıyer et al., 2020 ). Existing literature\nalso presented that the patient volume, LOS, and numbers of diagnostic tests ordered differedsignificantly between categories of demographic variables (Sarıyer & Ataman, 2020 ). We,\ntherefore, identified these factors as internal factors to represent the ED environment in normalcircumstances. To measure the values of these inputs, we used the study’s data set coveringthe before-COVID-19 period. As in output variables, we made the required transformationsto obtain the daily values of these input variables. The data set is described in Table 1.\nWe performed data pre-processing by dropping missing values in the dataset by using the\ndropna() function of the pandas module in Python. After this, based on standardization, we\nremoved the outliers from the data set by using the zscore() function of the pandas module\nTable 1 Definitions and measurement scales of the model variables\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\n1: Managing daily\nnumbers of patientY1: The daily number of\npatients arriving eachd a yi nt h eduring-COVID-19study period(numerical)X1: The average daily\nnumber of patientsarriving for each day ofthe week—Mondaythrough to Sunday(numerical)Representing govern-\nment restrictions andprohibitions\nX2: The whole curfew\nexists in the day to bepredicted or not (cat-egorical)\nX3: Curfew for young\nexists in the day to bepredicted or not (cat-egorical)\nX4: Curfew for the\nelderly exists in theday to be predictedor not (categorical)\nX5: Transport ban\nexists in the day tobe predicted or not(binary)\n2: Managing daily\naverage LOS ofpatientsY2: Average daily LOS\nof 
patients arriving eachd a yi nt h eduring-COVID-19study period(numerical)X7-X8: average daily\nLOS of female-malepatients for each day ofthe week (numerical)\nX9-X10-X11: Average\ndaily LOS of agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\n123\nAnnals of Operations Research (2023) 328:1073–1103 1083\nTable 1 (continued)\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\nX12 through X15:\nAverage daily LOS oftriage groups—red,yellow, green, traumazones—for each day of\nthe week (numerical)\nX16 through X37:\nAverage daily LOS ofICD-10 encodeddiagnosis, for 21\ngroups\n*, for each day\nof the week(numerical)\n3: Managing daily\nnumbers of ordered\nlaboratory testsY3: The daily number of\nlaboratory tests ordered\nin the\nduring-COVID-19study period(numerical)X38-X39: Average daily\nnumbers of laboratory\ntests ordered for\nfemale-male patientsfor each day of theweek (numerical)\nX40-X41-X42: Average\ndaily numbers of\nlaboratory testsordered for agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\nX43-X44: Average daily\nnumbers of laboratorytests ordered for arrivaltype groups—byambulance orwalk-in—for each dayof the week(numerical)\nX45 through X48:\nAverage daily numbersof laboratory testsordered for triagegroups; red, yellow,green, trauma zones,for each day of theweek (numerical)\n123\n1084 Annals of Operations Research (2023) 328:1073–1103\nTable 1 (continued)\nOperation Defined output variables\n(symbol, definition, scale)Operation-specific input\nvariables representingsystem dynamics(symbol, definition,scale)Common input vari-\nables(symbol, definition,scale)\nX49 through X69:\nAverage daily numbersof laboratory testsordered for ICD-10encoded diagnosis, for\n21 groups\n*, for each\nday of the week(numerical)Representing system\ndynamics\nX1-fcast: Predicted\ndaily number 
ofpatients with Model\n1 on each day\nduring-COVID-19study period(numerical) –used in2\nnd,3rd,a n d4th\noperations modeling\n4: Managing daily\nnumbers of orderedradiologic imagingtestsY4: The daily number of\nradiologic imaging testsordered in theduring-COVID-19study period(numerical)X70-X71: Average daily\nnumbers of radiologicimaging tests orderedfor female-malepatients for each day ofthe week (numerical)\nX72-X73-X74: Average\ndaily numbers ofradiologic imagingtests ordered for agegroups—[0–14],[15–64], ≥65—for\neach day of the week(numerical)\nX75-X76: Average daily\nnumbers of radiologicimaging tests orderedfor arrival typegroups—by ambulanceor walk-in—for eachday of the week(numerical)\nX77 through X80:\nAverage daily numbersof radiologic imagingtests ordered for triagegroups—red, yellow,green, traumazones—for each day ofthe week (numerical)\nX81 through X101:\nAverage daily numbersof radiologic imagingtests ordered forICD-10 encoded\ndiagnosis, for 21\ngroups\n*, for each day\nof the week(numerical)\n123\nAnnals of Operations Research (2023) 328:1073–1103 1085\nin Python. We initiated the categorical conversion of the input variables with the Categorical\nclass initializer of the pandas module in Python. We used the Categorical class to encode\nnumerical values as categorized by the capability of initializing the corresponding variableswith categorical values. After these pre-processing steps, we obtained the structured data setfor further modeling with the MLP neural network.\nAs seen in Table 1, we identified the government policies as common input variables in each\noperation to analyze their effects on each of the defined output variables for the correspondingoperations. However, once we predicted the daily number of patients in Operation 1, we usedthese predictions to describe system characteristics in all other models. 
The daily number of patients may affect the average daily LOS, and the number of each diagnostic test ordered.\n5 Results\n5.1 Descriptive results\nThe study period covering the before-COVID-19 period included 100 days of data, and the total number of patients arriving during these days was 158,347. Laboratory tests were ordered for 29,953 of these patients, and 43,106 radiologic imaging tests were ordered. On the other hand, the study period covering the during-COVID-19 period included 113 days of data, and the total number of patients arriving during these days was 79,805. The number of laboratory and radiologic imaging tests ordered during this period was 25,154 and 31,488. The average daily LOS was 117.53 min in the before-COVID-19 period and 165.03 min in the during-COVID-19 period. Daily values for the number of patients, average LOS, and numbers of each type of diagnostic test ordered in the whole study period are depicted in Fig. 3.\nThese results show that while daily and total numbers of patients and diagnostic tests\nordered sharply decreased, average LOS values increased during the during-COVID-19 period compared to before-COVID-19. However, although decreases are seen in three of the operations’ output variables (1, 3, 4), the sharpest decline was seen in Operation 1’s output, the daily number of patients. The decrease in patient numbers may have also caused the decline in the number of tests ordered. On the other hand, it should be noted that, although patient and diagnostic test numbers decreased, average LOS values increased. All these critical numerical findings could be due to the change in the system dynamics, which were mainly caused by patients who occupied EDs unnecessarily and did not need an emergency service.\nWe categorized the patients into three groups to support this idea by numerical findings\nconsistent with our model boundaries and comparatively presented the related statistics for\nFig. 
3 Daily values of the models’ output variables in the study period\n123\n1086 Annals of Operations Research (2023) 328:1073–1103\neach of these. These categories were: patients requiring no diagnostic tests, laboratory tests,\nand radiologic imaging tests. Since diagnostic tests are one of the most critical resources fordiagnosing patients, we believe most patients for whom no tests are ordered can representthe cases that occupy EDs for non-urgent conditions.\nFor these categories, the average daily numbers of patients and their average LOS are\nshown for each day of the week before-COVID-19 and during-COVID-19 periods in Fig. 4.\nFigure 4shows that while average daily values for patient numbers decreased in each of the\nthree categories in the during-COVID-19 period compared to the before-COVID-19 period,the majority of the decrease is related to the category of patients requiring no diagnostic test.Although it is worth noting that reductions were seen in the number of patients requiring nodiagnostic test, some increases were seen in their average LOS values in the during-COVID-19 period. This finding mainly supports our hypothesis. On the other hand, at least some\nFig. 4 Daily average patient numbers and LOS values for each day of the week\n123\nAnnals of Operations Research (2023) 328:1073–1103 1087\ndecreased levels were observed in the average LOS values of patients requiring diagnostic\ntests during the pandemic period. This could be due to the decreases in resource utilization.When resource utilization decreases, it accelerates access to resources and enables moreefficient use. Based on the daily distributions of patient numbers, one other finding should benoted. In the patients requiring no diagnostic test category, while Saturdays and Sundays, thatis, the weekend, had the highest daily patient numbers compared to weekdays in the before-COVID-19 period, daily numbers were the highest on Mondays in the during-COVID-19period. 
The impact of government restrictions and prohibitions on ED operations is directlyseen in this finding. Since most of the weekends, total curfews were imposed during thisperiod, patient volume, particularly in the patients requiring no diagnostic test category,sharply decreased at weekends.\nTable 2shows the total number of patients arriving at this ED based on the categories of\nthe considered demographics (gender, age, triage, arrival types, diagnosis) for the before-and during-COVID-19 study periods comparatively.\nFrom the values of Table 2, it should be seen that the distribution of patient numbers\nbased on gender changed in the during-COVID-19 period compared to the before-COVID-19period, as the number of male patients increased. Differences were also depicted based on agedistributions. For each of the three categories, in the young group, age:[0–14], patient numbersand distributions sharply decreased in the during-COVID-19 period, and in the elderly group,age≥65. In contrast, distributions fell in the patients requiring diagnostic tests category\noverall. There was some increase in this age category. Additionally, for all three types,the distribution of patients arriving by ambulance increased in the during-COVID-19 studyperiod. Another important finding showed that, while distributions of green zone patientssignificantly decreased in the patients requiring no diagnostic test category, the distributionof green zone patients increased in some other categories. Finally, significant differenceswere observed between 22 different ICD-10 encoded diagnosis types on the distributions ofthe four main groups. These ICD-10 codes were J00-J99 (disease of the respiratory system),M00-M99 (disease of musculoskeletal system and connective tissue), R00-R99 (symptoms,signs, and abnormal clinical and laboratory findings, not elsewhere classified), and U00-U85(codes for special purposes, COVID-19 here). 
The significant differences in the distributions of these diagnosis types are associated with the COVID-19 pandemic and the season.\n5.2 Model results\nThe proposed model was implemented in the obtained data sets of the corresponding case study. Since we focus on four primary ED operations, the model was tested repetitively four times for Operations 1 through 4, which increased the model’s validity.\nIn this section, the relation between the identified input variables and the corresponding\noutput variables for each ED operation of interest will be presented based on the results of the Pearson correlation analysis. The statistical association between the model variables is presented in a heat-map structure in the Appendix for each operation. In Table 3, we showed\nthe direction, magnitude, and significance level of the relationships, notably the significant input variables of the model for each operation.\nFrom the values of Table 3, it is observed that the defined input variables of Operation\n1, X1 through X5, were all significantly related to the output variable Y1. Besides, the relations were in a negative direction. 
This demonstrates how policy-based restrictions andprohibitions reduce the predicted number of daily patients in the during-COVID-19 period.Nonetheless, while it is observed that the system dynamics related to input variable X1 had a\n123\n1088 Annals of Operations Research (2023) 328:1073–1103\nTable 2 Distributions of each patient demographic variable for three categories in the before- and during-\nCOVID-19 periods\nVariable Levels Patients requiring no\ndiagnostic testPatients requiring\nlaboratory testsPatients requiring\nradiology tests\nBefore During Before During Before During\nn (%) n (%) n (%) n (%) n (%) n (%)\nGender Female 47,670\n(47.742)14,784\n(42.444)16,636\n(55.540)12,407\n(49.324)21,894\n(51.177)14,899\n(47.316)\nMale 52,179\n(52.258)20,048\n(57.556)13,317\n(44.460)12,747\n(50.676)20,887\n(48.823)16,589\n(52.684)\nAge age: [0–14] 20,722\n(20.753)3,683\n(10.574)4,726\n(15.778)1,715\n(6.818)7,991\n(18.679)2,951\n(9.372)\nage:\n(15–64)70,980\n(71.087)27,538\n(79.059)17,730\n(59.193)18,310\n(72.792)26,814\n(62.677)23,309\n(74.025)\nage≥65 8,147\n(8.159)3,611\n(10.367)7,497\n(25.029)5,129\n(20.390)7,976\n(18.644)5,228\n(16.603)\nTriage\nlevelgreen room 68,335\n(68.438)12,122\n(34.801)2,624\n(8.760)6,490\n(25.801)7,746\n(18.106)7,279\n(23.117)\nyellow\nroom23,212\n(23.247)14,888\n(42.742)20,542\n(68.581)12,037\n(47.853)21,406\n(50.036)12,280\n(38.999)\nred room 2,313\n(2.316)2,076\n(5.960)5,904\n(19.711)5,737\n(22.808)4,833\n(11.297)4,950\n(15.720)\ntrauma\nroom5,989\n(5.998)4,904\n(14.079)883\n(2.948)890\n(3.538)8,796\n(20.561)6,979\n(22.164)\nArrival\ntypewalk in 98,553\n(98.702)33,148\n(95.165)24,508\n(81.822)19,224\n(76.425)37,374\n(87.361)25,642\n(81.434)\nby ambu-\nlance1,296\n(1.298)1,684\n(4.835)5,445\n(18.178)5,930\n(23.575)5,407\n(12.639)5,846\n(18.566)\nICD-10\nencodeddiagno-sisA00-B99 3,095\n(3.100)755\n(2.168)241\n(0.805)193\n(0.767)156\n(0.365)96 (0.305)\nC00-D49 32\n(0.032)24\n(0.069)49\n(0.164)31\n(0.123)43\n(0.101)24 (0.076)\nD50-D89 
135\n(0.135)139\n(0.399)75\n(0.250)88\n(0.350)37\n(0.086)51 (0.162)\nE00-E89 108\n(0.108)122\n(0.350)131\n(0.437)117\n(0.465)74\n(0.173)81 (0.257)\nF01-F99 696\n(0.697)515\n(1.479)223\n(0.744)183\n(0.728)132\n(0.309)\n124\n(0.394)\nG00-G99 1,211\n(1.213)540\n(1.550)335\n(1.118)221\n(0.879)415\n(0.970)277\n(0.880)\nH00-H59 646\n(0.647)453\n(1.301)10\n(0.033)7 (0.028) 12\n(0.028)6 (0.019)\n123\nAnnals of Operations Research (2023) 328:1073–1103 1089\nTable 2 (continued)\nVariable Levels Patients requiring no\ndiagnostic testPatients requiring\nlaboratory testsPatients requiring\nradiology tests\nBefore During Before During Before During\nn (%) n (%) n (%) n (%) n (%) n (%)\nH60-H95 1,541\n(1.543)576\n(1.654)61\n(0.204)36\n(0.143)63\n(0.147)46 (0.146)\nI00-I99 1,113\n(1.115)730\n(2.096)1,192\n(3.980)857\n(3.407)959\n(2.242)715\n(2.271)\nJ00-J99 36,073\n(36.128)5,368\n(15.411)3,174\n(10.597)5,223\n(20.764)4,427\n(10.348)4,913\n(15.603)\nK00-K95 3,925\n(3.931)1,789\n(5.136)1,580\n(5.275)935\n(3.717)1,184\n(2.768)753\n(2.391)\nL00-L99 1,384\n(1.386)1,154\n(3.313)69\n(0.230)67\n(0.266)38\n(0.089)47 (0.149)\nM00-M99 13,190\n(13.210)7,625\n(21.891)2,459\n(8.210)1,933\n(7.685)14,039\n(32.816)8,924\n(28.341)\nN00-N99 2,050\n(2.053)1,206\n(3.462)2,434\n(8.126)1,562\n(6.210)1,673\n(3.911)1,195\n(3.795)\nO00-O9A 28\n(0.028)25\n(0.072)17\n(0.057)18\n(0.072)54\n(0.126)28 (0.089)\nP00-P96 49\n(0.049)51\n(0.146)50\n(0.167)39\n(0.155)5 (0.012) 4 (0.013)\nQ00-Q99 3 (0.003) 5 (0.014) 4 (0.013) 5 (0.020) 5 (0.012) 6 (0.019)\nR00-R99 11,797\n(11.815)3,544\n(10.175)13,110\n(43.769)7,599\n(30.210)12,321\n(28.800)6,957\n(22.094)\nS00-T88 2,556\n(2.560)1,790\n(5.139)193\n(0.644)179\n(0.712)632\n(1.477)537\n(1.705)\nU00-U85 0 (0.000) 644\n(1.849)0 (0.000) 2,106\n(8.372)0 (0.000) 1,971\n(6.260)\nV00-Y99 1,448\n(1.450)1,286\n(3.692)517\n(1.726)426\n(1.694)1,509\n(3.527)801\n(2.544)\nZ00-Z99 18,769\n(18.797)6,491\n(18.635)4,029\n(13.451)3,329\n(13.234)5,003\n(11.694)3,932\n(12.487)\nsignificant 
relation with the model output variable, the relations of the policy-based variables,\nparticularly X5, X2, and X3, were more substantial. However, for Operation 2, we observedthat most of the selected input variables were not significantly related to Y2. We observedthat only X1-fcast and X5 were related considerably to Y2. As also seen in Table 3,m o s to f\nthe selected input variables of the model were significant while modeling Operations 3 and4. We also observed that some of the selected policy-based variables had significant negativerelations with Y3 and Y4. This result demonstrated that such policies caused substantialdecreases in resource usage of EDs during-COVID-19 period.\nAfter analyzing the effects of the identified input variables on the operations, we further\nprocessed the obtained data sets using the MLP neural networks. MLPRegressor in the neuralnetwork package of the sklearn module in Python was initialized to process the data sets of the\nmodels. The solver function of the algorithm chosen was adam() and the activation function\n123\n1090 Annals of Operations Research (2023) 328:1073–1103\nTable 3 Correlation results for significant input parameters of the model for each of the operations\nModeling daily patient\nnumbers: Operation 1Modeling average\ndaily LOS: Operation2Modeling daily numbers\nof ordered laboratorytests: Operation 3Modeling daily\nnumbers of orderedradiologic imagingtests: Operation 4\nrY1−X1=-0.25**\nrY1−X2=-0.40**\nrY1−X3=-0.43**\nrY1−X4=-0.22*\nrY1−X5=-0.76**rY2−X1−fc a s t =\n0.18*\nrY2−X5=− 0.29**rY3−X1−fc a s t =\n0.39**\nrY3−X2=-0.36**\nrY3−X38=0.24**\nrY3−X39=0.33**\nrY3−X40=0.19*\nrY3−X41=0.27**\nrY3−X43=0.29**\nrY3−X46=0.33**\nrY3−X49=0.28**\nrY3−X51=0.39**\nrY3−X53=0.19*\nrY3−X55=− 0.28**\nrY3−X58=0.37**\nrY3−X59=0.22\nrY3−X62=− 0.24**\nrY3−X63=− 0.28**\nrY3−X64=− 0.38**\nrY3−X66=0.27**\nrY3−X67=0.39**rY4−X1−fc a s t 
=\n0.87**\nrY1−X2=-0.42**\nrY1−X3=-0.31**\nrY1−X5=-0.66**\nrY1−X70=0.34**\nrY1−X71=0.36**\nrY1−X72=0.42**\nrY1−X73=0.38**\nrY1−X74=0.30**\nrY1−X75=0.42**\nrY1−X77=0.26**\nrY1−X78=0.40**\nrY1−X79=0.32**\nrY1−X80=0.28**\nrY1−X82=0.32**\nrY1−X83=0.30**\nrY1−X87=0.25**\nrY1−X88=0.19*\nrY1−X90=0.36**\nrY1−X91=0.32**\nrY1−X92=-0.30**\nrY1−X93=0.37**\nrY1−X95=− 0.20*\nrY1−X98=0.30**\nrY1−X99=0.24**\n*Correlation is significant in 95%CI\n**Correlation is significant in 99%CI\nselected was relu() . The train test split was used for experimentation, and the separation was\napplied randomly. The train/test split value of 0.8 was applied. The experiment was repeatedseveral times to obtain the optimal model parameters for learning rate, momentum, and thenumber of hidden layers. The prediction performances of the models were tested on the testdata sets based on the mean absolute percentage error (MAPE), and the root mean squareerror (RMSE) statistics. The optimal model parameters specific to each model and modelperformances are represented in Table 4.\nTable 4shows that the proposed model performs well for managing ED operations in\nthe COVID-19 periods. The model, tested in four different operations, achieved around 90%accuracy in two of these operations and 95% accuracy in one. 
On the other hand, in one of the\n123\nAnnals of Operations Research (2023) 328:1073–1103 1091\nTable 4 MLP neural network performances on ED operations predictions during-COVID-19\nED operations during-COVID-19 and\nrelated modelOptimized parameters (learning\nrate-LR, momentum-M, number ofhidden layers-HLModel\nperformance\nMAPE RMSE\nModelling daily patient numbers:\nOperation 1LR=0.01, M =0.01, HL =2 10.573 88.624\nModelling daily average LOS:\nOperation 2LR=0.5, M =0.2, HL =3 19.309 40.473\nModelling daily numbers of ordered\nlaboratory tests: Operation 3LR=0.001, M =0.125, HL =4 9.884 28.325\nModelling daily numbers of ordered\nradiologic imaging tests: Operation 4LR=0.019, M =0.19, HL =3 5.924 20.324\noperations modeling average daily LOS, the model performance was lower, having around\n80% accuracy. The model results are also consistent with the findings on the relationshipbetween model attributes. Since lower relations were observed between variables on LOSmodeling, prediction performance could not achieve the modeling performances on otheroperations with higher correlation levels between the variables. Nonetheless, the achievedaccuracies were still acceptable and practically implementable compared with related studiesand targeted levels.\n6 Discussion\nThis study emphasizes implementing emerging technologies, particularly BDA, in manag-ing health services’ operations. As noted in the literature (Akter & Wamba, 2019 ; Donthu &\nGustaffson, 2020 ), we believe that the challenges posed by COVID-19 can be tackled using\nthese technologies. Grounded in dynamic capabilities and the related context of BDAC, weproposed a model for the management of ED operations in emergencies. To show the valid-ity of the proposed model, we tested it in four different primary operations of EDs. 
Whiledefining the model variables, besides using the system dynamics-related factors, we imple-mented additional variables to represent the effect of government restrictions and prohibitionsimposed to cope with emergencies. Thus, we contribute to the literature by proposing an effi-cient system for managing ED operations in emergencies by implementing emerging BDAtechnologies and investigating the effects of these policy-based factors on ED operations.\nThe model has been validated using real-life data from a large-scale ED operating in\n˙Izmir city, Turkey. Although the overcrowded environments of EDs are a global problem,\nthis problem is worse in some countries, such as Turkey, in which EDs are frequently occupiedunnecessarily by non-emergent patients. By comparing the daily and total patient volumes inthe before- and during-COVID-19 study periods, the descriptive findings on the case data setmainly represent the significance of this problem in this ED since patient volumes sharplydecreased during-COVID-19 period. By classifying patients into three categories—patientsrequiring no diagnostic tests, laboratory tests, and radiologic imaging tests—and identifyingthat the reduction in patient volume was mainly caused by the first category (patients requir-ing no diagnostic tests), we also provide evidence to support this finding. We additionallysupport this finding by observing increases in the average LOS values of patients who do not\n123\n1092 Annals of Operations Research (2023) 328:1073–1103\nrequire any diagnostic tests. Contrarily, the average LOS values were observed to decrease\nfor patients requiring diagnostic tests during-COVID-19 period. All these findings demon-strate that most patients make unnecessary visits to this ED. This result supports the existingstudies reporting a substantial decrease in ED visits during the COVID-19 (Jeffery et al.,2020 ; Schereyer et al., 2020 ). 
We also contribute to the literature by linking this result to one\nof the biggest operational challenges of EDs and demonstrating that unnecessary visits arethe leading cause of overcrowded ED environments. Besides, from the practical viewpoint,the decrease in patient numbers and diagnostic test orders during COVID-19 may be usedfor hospital managers’ better scheduling and allocation of ED resources. Although a sharpdecline was observed in these values, a significant increase was observed in patients’ averageLOS values, meaning that arriving patients to EDs during-COVID-19 required more andlonger interventions and treatments. Thus, better planning and allocation of ED resourceswill be essential for functioning these services during emergencies.\nSignificant decreases in patient volume during-COVID-19 period may be related to two\nmain factors. First, the pandemic created stress in patients. To protect themselves from beinginfected, they may have avoided visiting EDs if they did not have emergent or urgent sit-uations. Second, due to the government restrictions and prohibitions imposed, people werepartially obliged to stay at home if they did not need an emergent or urgent health service.Since the first factor is more behavioral, it is beyond the scope of this study. However, weaimed to identify the impacts of policy-based factors on ED operations by adopting our modelinto a case study representing the overcrowding of ED environments and frequently unneces-sary ED visits. This result supports the existing studies reporting decreased patient volumesdue to the governmental actions taken in fighting COVID-19 (Kendzerska et al., 2021 ; Sözen\net al., 2022 ). 
It also enhances literature by considering this effect in developing prediction\nmodels for patient volumes, average stay lengths of patients, and resource utilization of EDsduring this pandemic period.\nThe depicted decreases in the average LOS values of patients requiring laboratory or\nradiologic imaging tests in the during-COVID-19 period compared to the before-COVID-19period highlights another essential finding of this study. While this finding has been widelypresented in the literature (Houshyar et al., 2020 ; Jeffery et al., 2020 ), by proposing an\nefficient data-driven model for predicting the daily utilization of these services during thispandemic, once again, this study differs from the existing studies. As an interpretation, itshould be noted that the decrease in the utilization of EDs’ resources accelerates the accessto resources and enables more efficient use of them, and solves another challenge of longwaiting times in EDs.\nA critical step in devising the proposed model was determining the model inputs appro-\npriately. In the case study implementation, input variables are defined in two categories as(i) variables representing system dynamics and (ii) government restrictions and prohibitions.While policy-based variables are defined commonly in implementing the proposed modelfor considered ED operations, system dynamics-based variables are explicitly defined foreach operation. The primary demographics, such as gender, age, triage level, arrival type,and ICD-10 encoded diagnosis in the ED patients’ database, were used and appropriatelytransformed to identify operation-specific input variables. 
The values of these variables weremeasured based on the data set for the before-COVID-19 study period.\nAfter forming data sets in this manner, the proposed model was tested for the considered\nED operations of managing the daily number of patients, average daily LOS, daily numbersof laboratory tests ordered, and daily numbers of radiologic imaging tests ordered. Whenthe relations between the specified input variables and the daily number of patients during-COVID-19 period were analyzed, it was concluded that policy-based attributes have more\n123\nAnnals of Operations Research (2023) 328:1073–1103 1093\nsignificant effects on the daily number of patients compared to the identified system dynamics-\nrelated input variables. Some relations were observed between the defined input variables,such as transport bans and restrictions on the elderly, and the daily average LOS during-COVID-19. While policy-based variables, such as total curfew, are related to the daily numberof laboratory tests ordered during-COVID-19 period, some other system dynamics-relatedinput variables also have relations with the corresponding output variable. Finally, bothpolicy-based attributes, namely, curfews and restrictions and transport bans, and most systemdynamics-related variables seemed to relate to the daily number of radiologic imaging testsordered. It is also noted that the depicted correlations between policy-based input variablesand the corresponding output variables had negative signs showing that such policies maydecrease patient volume and the utilization of primary ED resources. From these findings, itis concluded that the restrictions and prohibitions imposed by the government in coping withCOVID-19 have had significant impacts on the management of ED operations. This resultis in line with the existing studies (Akter & Wamba, 2019 ; Haldane & Morgan, 2021 ; Sözen\net al., 2022 ). 
Our findings contribute to the literature by investigating the effects of system\ndynamics-related and government-imposed actions together and comparatively for differentoperations of EDs.\nThe obtained data sets were then used to implement the proposed model in the four primary\nED operations using MLP neural networks. Neural network algorithms have been presentedin the literature for automatic COVID-19 detection (Qayyum et al., 2021 ) and infection rate\npredictions (Wieczorek et al., 2020 ; Sozen, Sariyer & Ataman, 2021). By implementing this\nalgorithm in multi real-life operations of EDs, the used contexts of this BDA technique havebeen extended in this paper. The model has high prediction accuracies for managing dailypatient numbers and daily use of resources during a pandemic. Besides achieving or exceedingthe prediction performances of models in the literature in this context (Whitt & Zhang, 2019 ),\nthese results achieved the targeted value (85%) set by this ED’s service providers. Althoughthe model’s performance is lower in predicting daily average LOS values, it can still matchthe performance of previous studies (Ataman & Sariyer, 2021) and achieve the targeted valueof 75% accuracy. This operation’s targeted value is smaller than others since modeling LOS ismore complex. Thus, with the proposed model, which utilizes BDA, we believe that even themost challenging health services operations may be managed efficiently, and the difficultiesposed by emergencies can be handled.\n7 Implications\n7.1 Theoretical implications\nThe study underpins the dynamic capability theory in two folds. The emergencies are featuredwith the rapidly changing conditions and parameters. Hence, the data inherent in the crisesexhibits a dynamic feature. Eventually, the properties of the data set are subject to change.Therefore, DC theory arises as an ideal theoretical structure to embrace dynamically changingenvironments caused by emergencies. 
While such situations cause rapid changes in patientvolumes, varieties, and characteristics, from different viewpoints, the government’s policies,such as restrictions and prohibitions in fighting these situations, create additional modifi-cations in the system environment. For instance, during emergencies caused by pandemicillnesses, volumes of infected patients may significantly increase. The total patient volumein health services may also be decreased due to panic and stress factors created by being\n123\n1094 Annals of Operations Research (2023) 328:1073–1103\ninfected and based on governmental policies such as stay-home warnings and curfews. All of\nthis support how emergencies create dynamically changing environments. This implicationis strengthened by comparing the main features of the health system data before-COVID19and during-COVID19 periods. Hence, the study’s findings state that DC is applicable inemergencies.\nThe second fold of the theoretical implication can be asserted that dynamically changing\nenvironments caused by emergencies affect decision-making processes. As the propertiesof the data set act in a dynamic manner, it forces the decision-making process to be in linewith this rapid change. Even though the big data nature of the data sets stays the same,the time pressure on the decision-makers is higher due to the fast and dynamic change ofdata. Thus, the need for rapid decision-making increases the need for the capabilities relatedto data analytics. Therefore, BDAC is a crucial structure for building the decision-makingmechanism within emergencies. Once again, the study’s findings support this implication byhighlighting the significant changes in patient volumes, demographics (such as distributionson gender, age, triage, arrival type, and diagnosis categories), and diagnostic test requirements(resource usage) between the before and during pandemic periods. 
Being aware of changes insuch parameters and having capabilities of shaping ED services rapidly in response to thesechanges provide significant advantages in fighting emergencies. Thus, it can be depicted thatBDAC is applicable in emergencies.\nThus, although dynamic capability theory and the recent view of BDAC have been well\npresented in management literature, this study attempts to extend their usage in the healthcontext, particularly under emergencies. By discussing the rapidly changing parameters andfeatures of the health system environments in emergencies, proposing a model highlightinga need for BDAC, and implementing this model in a real-life big data study, this study aimsto contribute to the context of these theories.\n7.2 Managerial implications\nOur main suggestion is that the decision-makers of health services have BDAC and use bigdata sets of their system environments effectively to create meaningful knowledge, whichshould then be turned rapidly into actions. Adopting the system to dynamically changing con-ditions caused by emergencies quickly and efficiently should be achieved by taking advantageof the emerging technologies and by being able to implement these technologies in practicefor planning and managing operations. Based on the results of this study, we showed howthe current emergency, COVID-19, and the government policies change the patient volumes,varieties, and characteristics. Since such changes may significantly affect ED operations, andbecause it is essential to provide rapid responses to these changing situations, it should alsobe noted that understanding and identifying the main factors that impact their operations iscritical. Suppose system-related factors are characterized and appropriately measured, andexternal factors that may arise from the emergencies are carefully followed and identified.All these factors can be collectively used in modeling ED operations by taking advantageof BDA technologies. 
Hence, the system may function efficiently even in emergencies. Thechallenges arising in the ED environment and posed by emergencies can be easily managed insuch conditions. Based on such models, the managers will be able to make rapid and correctdecisions and adapt the system efficiently to dynamically changing conditions.\nWe also highlight the importance of data recording in health services. Although BDA and\nBDAC are significant technologies and capabilities for health services and particularly emer-gency departments, all these do not make any sense if there exist no data sets to analyze, create\n123\nAnnals of Operations Research (2023) 328:1073–1103 1095\nknowledge, and use in decision making. Therefore, we suggest that the ED decision-makers\nfocus on electronic recording and data storage processes and should not avoid investing inthese processes and systems. Since the quantity and quality of the data allow meaningful andactionable knowledge, the decision-makers should spend time and effort testing the quality ofrecording processes. Assuring the existence of valid and reliable big data sets is the primaryprior condition for an ED decision-maker to take advantage of BDA in fighting against thechallenges and uncertainties posed by emergencies. This is also very important for satisfyingthe sustainable monitoring in ED processes and real-time emergency response applications.\n7.3 Policy implications\nThis study mainly emphasized the overcrowded ED environments and the significance of thisproblem in our ED, even regularly. Based on the findings, we noted that this overcrowdingmight be primarily associated with the redundant use of these services, particularly for patientswho occupy them for non-urgent situations. These types of patients generally perceive EDsas gateways to hospitals. To not make an appointment and wait in line for polyclinic servicesor receive a health service at weekends or nights, as EDs provide a 7/24 service, patientsmay choose to visit EDs. 
However, providing a timely and efficient service becomes morechallenging in these crowded environments based on limited resources. If ED operationscannot be appropriately managed, patients even in emergent and urgent situations may have towait to be treated, which may have significant consequences. To cope with this overcrowdingproblem, different government actions should be taken.\nThis study also analyzes the effects of government restrictions and prohibitions in coping\nwith emergencies, particularly COVID-19. It should be highlighted that imposing these poli-cies is crucial in emergencies to protect the functioning of EDs. Government policies, suchas curfews (lock-downs), transport bans, and partial restrictions on the elderly or the young,may decrease patient volumes, redundant ED visits, and resource utilization.\nIn today’s era that requires awareness of big data and the related contexts of BDA and\nBDAC, we also advise policymakers to invest in data storage and analysis in governmentagencies. Governments must create awareness of these emerging concepts and technologiesin public institutions. Governments should pay time, effort, and budget to regularly controlthe agencies based on their data storage capabilities, qualities, quantities, and reliabilities.It may be necessary to impose sanctions on institutions deficient in these concepts duringthese controls. Creating high-quality, reliable, and robust data sets in government institutionswill improve more accurate and timely decision-making processes in emergency and routinesituations. This may also help governments integrate sustainability orientation in health careoperations and flexibility for managing emergencies.\n8 Conclusion\nWhile emergencies precisely demonstrate dynamically changing environments, health ser-vices are the main actors in coping with those situations. 
Governments are another leadingactor; they are the enablers of the system and may impose restrictions and prohibitions toprotect the functioning of health services. We, therefore, propose a model, which is groundedin the dynamic capabilities and related context of BDAC, for managing operations of one ofthe most crucial health services units, namely, EDs, during emergencies. With this model,we aim not only to manage ED operations sustainably but also to investigate the effects\n123\n1096 Annals of Operations Research (2023) 328:1073–1103\nof imposed restrictions and prohibitions on these operations. Besides proposing a generic\nmachine learning integrated model for managing ED operations under emergencies and vali-dating this model for different operations of EDs, taking the governmental actions as the mainfactors of this model and thus showing how they affect these operations is the main contri-bution of this paper. This study also contributes to dynamic capability theory and BDAC byextending their usage for the decision-making processes of one of the most important actorsof health services, EDs, under emergencies. We also believe that the proposed BDA-drivenmodel or more general big data and BDA implementations in real-life operations may helpsatisfy sustainable operations in EDs.\nThe proposed model adopts one of the most popular BDA techniques: multilayer per-\nceptron neural networks. The model is implemented in a real-life data set representing alarge-scale ED with daily patient volumes of more than 1,000. The current COVID-19 pan-demic represents a focused emergency. The model is validated in four different primaryoperations of EDs: managing daily numbers of patients, daily average stays of patients anddaily usage of resources (laboratory services and radiologic imaging services). The predic-tion performance of the proposed model varies between 80 to 95% for the correspondingoperations. 
This study also showed that policy-based factors might significantly affect EDoperations. Such restrictions and prohibitions may cause sharp decreases in patient volumesand resource utilisations in EDs, which are challenged by overcrowding. Thus, imposingsuch policies is crucial to protect ED functioning in emergencies.\nThe main limitation of this study was that its experimental evaluation was based on data\ncollected from a single case study, and its findings may, therefore, not generalize to emer-gency departments with significantly different patient populations, characteristics, volumes,and varieties. Generalizing these results to other emergency departments with different oper-ational processes, guidelines, and dynamics may also be impossible. Operationally, to ensurerobustness, it is critical to check for variations in patient and system dynamics patternsobserved in this case study to transfer the proposed model to other emergency departments.Future studies should include a broader set of operations, measurements, internal and exter-nal variables, and outcomes from multiple emergency departments to support the robustnessof the proposed model. Finally, we expect that the implementation of deep learning tech-niques can potentially further improve the predictive performance of the proposed model forconsidered operations of EDs.\nAppendix 1\nCorrelation matrices of the identified variables of the models for corresponding ED opera-tions.\nSee Fig. 5.\n123\nAnnals of Operations Research (2023) 328:1073–1103 1097\nFig. 5 Operation 1: Modelling daily numbers of ED patients during COVID-19\n123\n1098 Annals of Operations Research (2023) 328:1073–1103\nAppendix 2\nSee Fig. 6.\nFig. 6 Operation 2: Modelling daily average LOS of ED patients during COVID-19\n123\nAnnals of Operations Research (2023) 328:1073–1103 1099\nAppendix 3\nSee Fig. 7.\nFig. 
7 Operation 3: Modelling daily numbers of laboratory tests ordered\n123\n1100 Annals of Operations Research (2023) 328:1073–1103\nAppendix 4\nSee Fig. 8.\nFig. 8 Operation 4: Modelling daily numbers of radiologic imaging tests ordered\nReferences\nAbdel-Basset, M., Chang, V ., & Nabeeh, N. A. (2021). An intelligent framework using disruptive technologies\nfor COVID-19 analysis. T echnological F orecasting and Social Change, 163 , 120431.\nAkter, S., & Wamba, S. F. (2019). Big data and disaster management: A systematic review and agenda for\nfuture research. Annals of Operations Research, 283 (1), 939–959.\nAlinaghian, M., & Goli, A. (2017). Location, allocation and routing of temporary health centers in rural\nareas in crisis, solved by improved harmony search algorithm. International Journal of Computational\nIntelligence Systems, 10 (1), 894–913.\nAtaman, M. G., & Sarıyer, G. (2021). Predicting waiting and treatment times in emergency departments using\nordinal logistic regression models. The American Journal of Emergency Medicine, 46 , 45–50.\nBag, S., Gupta, S., Choi, T. M., & Kumar, A. (2021). Roles of innovation leadership on using big data analytics\nto establish resilient healthcare supply chains to combat the COVID-19 pandemic: A multimethodologicalstudy. IEEE Transactions on Engineering Management .https://doi.org/10.1109/TEM.2021.3101590\n123\nAnnals of Operations Research (2023) 328:1073–1103 1101\nBirkinshaw, J., Zimmermann, A., & Raisch, S. (2016). How do firms adapt to discontinuous change? Bridging\nthe dynamic capabilities and ambidexterity perspectives. California Management Review, 58 (4), 36–58.\nChoi, T. M. (2021). Fighting against COVID-19: What operations research can help and the sense-and-respond\nframework. Annals of Operations Research .https://doi.org/10.1007/s10479-021-03973-w\nChoi, T. M., Wallace, S. W., & Wang, Y . (2018). Big data analytics in operations management. Production\nand Operations Management, 27 (10), 1868–1883.\nDas, S. 
K., Pervin, M., Roy, S. K., & Weber, G. W. (2021). Multi-objective solid transportation-location problem\nwith variable carbon emission in inventory management: A hybrid approach. Annals of Operations\nResearch .https://doi.org/10.1007/s10479-020-03809-z\nDeloitte. (2020). COVID -19: Managing supply chain risk and disruption . Retrieved November 10,\n2020, from https://www2.deloitte.com/global/en/pages/risk/articles/covid-19-managing-supply-chain-\nrisk-anddisruption.html .\nDonthu, N., & Gustafsson, A. (2020). Effects of COVID-19 on business and research. Journal of Business\nResearch, 117 , 284.\nFeng, Q., & Shanthikumar, J. G. (2018). How research in production and operations management may evolve\nin the era of big data. Production and Operations Management, 27 (9), 1670–1684.\nFortune. (2020). 94% of the F ortune 1000 are seeing coronavirus supply chain disruptions: Report .\nRetrieved November 10, 2020, from https://fortune.com/2020/02/21/fortune-1000-coronavirus-china-\nsupply-chain-impact/ .\nGoli, A., Zare, H. K., Tavakkoli-Moghaddam, R., & Sadeghieh, A. (2019). Hybrid artificial intelligence and\nrobust optimization for a multi-objective product portfolio problem Case study: The dairy productsindustry. Computers and Industrial Engineering, 137 , 106090.\nGoli, A., Khademi-Zare, H., Tavakkoli-Moghaddam, R., Sadeghieh, A., Sasanian, M., & Malekalipour\nKordestanizadeh, R. (2021). An integrated approach based on artificial intelligence and novel meta-\nheuristic algorithms to predict demand for dairy products: a case study. Network Computation in Neural\nSystems, 32 (1), 1–35.\nGuo, M., Zhang, Q., Liao, X., Chen, F. Y ., & Zeng, D. D. (2020). A hybrid machine learning framework for\nanalyzing human decision-making through learning preferences. Omega, 101 , 102263.\nGupta, S., Justy, T., Kamboj, S., Kumar, A., & Kristoffersen, E. (2021). Big data and firm marketing per-\nformance: Findings from knowledge-based view. 
T echnological F orecasting and Social Change, 171 ,\n120986.\nHaldane, V ., & Morgan, G. T. (2021). From resilient to transilient health systems: The deep transformation of\nhealth systems in response to the COVID-19 pandemic. Health Policy and Planning, 36 (1), 134–135.\nHarvard Business Review. (2020). Coronavirus is proving we need more resilient supply chains . Retrieved\nNovember 5, 2020, from https://hbr.org/2020/03/coronavirus-is-proving-that-we-need-moreresilient-\nsupply-chains .\nHossain, M. K., Thakur, V ., & Mangla, S. K. (2021). Modeling the emergency healthcare supply chains:\nResponding to the COVID-19 pandemic. Journal of Business and Industrial Marketing .https://doi.org/\n10.1108/JBIM-07-2020-0315\nHoushyar, R., Tran-Harding, K., Glavis-Bloom, J., Nguyentat, M., Mongan, J., Chahine, C., Loehfelm, T.\nW., Kohli, M. D., Zaragoza, E. J., Murphy, P. M., & Kampalath, R. (2020). Effect of shelter-in-place\non emergency department radiology volumes during the COVID-19 pandemic. Emergency radiology,\n27(6), 781–784.\nHuang, H., Peng, Z., Wu, H., & Xie, Q. (2020). A big data analysis on the five dimensions of emergency\nmanagement information in the early stage of COVID-19 in China. Journal of Chinese Governance,\n5(2), 213–233.\nJeffery, M. M., D’onofrio, G., Paek, H., Platts-Mills, T. F., Soares, W. E., Hoppe, J. A., Genes, N., Nath, B.,\n& Melnick, E. R. (2020). Trends in emergency department visits and hospital admissions in health caresystems in 5 states in the first months of the COVID-19 pandemic in the US. JAMA internal medicine,\n180(10), 1328–1333.\nKapoor, K., Bigdeli, A. Z., Dwivedi, Y . K., & Raman, R. (2021). How is COVID-19 altering the manufac-\nturing landscape? A literature review of imminent challenges and management interventions. Annals of\nOperations Research .\nhttps://doi.org/10.1007/s10479-021-04397-2\nKendzerska, T., Zhu, D. T., Gershon, A. S., Edwards, J. D., Peixoto, C., Robillard, R., & Kendall, C. E. 
(2021).\nThe effects of the health system response to the COVID-19 pandemic on chronic disease management:A narrative review. Risk Management and Healthcare Policy, 14 , 575.\nKumar, A., Shankar, R., Choudhary, A., & Thakur, L. S. (2016). A big data MapReduce framework for\nfault diagnosis in cloud-based manufacturing. International Journal of Production Research, 54 (23),\n7060–7073.\n123\n1102 Annals of Operations Research (2023) 328:1073–1103\nKumar, A., Shankar, R., & Aljohani, N. R. (2020). A big data driven framework for demand-driven forecasting\nwith effects of marketing-mix variables. Industrial Marketing Management, 90 , 493–507.\nLee, S. M., & Trimi, S. (2021). Convergence innovation in the digital age and in the COVID-19 pandemic\ncrisis. Journal of Business Research, 123 , 14–22.\nMari´ c, J., Galera-Zarco, C., & Opazo-Basáez, M. (2021). The emergent role of digital technologies in the\ncontext of humanitarian supply chains: A systematic literature review. Annals of Operations Research .\nhttps://doi.org/10.1007/s10479-021-04079-z\nMidya, S., Roy, S. K., & Yu, V . F. (2021). Intuitionistic fuzzy multi-stage multi-objective fixed-charge solid\ntransportation problem in a green supply chain. International Journal of Machine Learning and Cyber-\nnetics, 12 (3), 699–717.\nMishra, D., Gunasekaran, A., Papadopoulos, T., & Childe, S. J. (2018). Big Data and supply chain management:\nA review and bibliometric analysis. Annals of Operations Research, 270 (1), 313–336.\nMondal, A., & Roy, S. K. (2021). Multi-objective sustainable opened-and closed-loop supply chain under\nmixed uncertainty during COVID-19 pandemic situation. Computers & Industrial Engineering, 159 ,\n107453.\nMondal, A., & Roy, S. K. (2022). Application of Choquet integral in interval type-2 Pythagorean fuzzy\nsustainable supply chain management under risk. International Journal of Intelligent Systems, 37 (1),\n217–263.\nPapadopoulos, T., Baltas, K. N., & Balta, M. E. (2020). 
The use of digital technologies by small and medium\nenterprises during COVID-19: Implications for theory and practice. International Journal of Information\nManagement, 55 , 102192.\nSarkis, J. (2021). Supply chain sustainability: Learning from the COVID-19 pandemic. International Journal\nof Operations & Production Management, 41 (1), 63–73.\nSchreyer, K. E., Daniel, A., King, L. L., Blome, A., DeAngelis, M., Stauffer, K., Desrochers, K., Donahue, W.,\nPolitarhos, N., Raab, C., & McNamara, R. (2020). Emergency department management of the Covid-19\npandemic. The Journal of emergency medicine, 59 (6), 946–951.\nThakur, V ., Mangla, S. K., & Tiwari, B. (2021). Managing healthcare waste for sustainable environmental\ndevelopment: A hybrid decision approach. Business Strategy and the Environment, 30 (1), 357–373.\nTirkolaee, E. B., Goli, A., Ghasemi, P., & Goodarzian, F. (2022). Designing a sustainable closed-loop supply\nchain network of face masks during the COVID-19 pandemic: Pareto-based algorithms. Journal of\nCleaner Production, 333 , 130056.\nQayyum, A., Razzak, I., Tanveer, M., & Kumar, A. (2021). Depth-wise dense neural network for automatic\nCOVID19 infection detection and diagnosis. Annals of Operations Research .https://doi.org/10.1007/\ns10479-021-04154-5\nQueiroz, M. M., Ivanov, D., Dolgui, A., & Wamba, S. F. (2020). Impacts of epidemic outbreaks on supply\nchains: Mapping a research agenda amid the COVID-19 pandemic through a structured literature review.Annals of Operations Research .https://doi.org/10.1007/s10479-020-03685-7\nSanders, N. R., & Ganeshan, R. (2018). Big data in supply chain management. Production and Operations\nManagement, 27 (10), 1745–1748.\nSarıyer, G., & Ataman, M. G. (2020). The likelihood of requiring a diagnostic test: Classifying emergency\ndepartment patients with logistic regression. Health Information Management Journal, 51 (1), 13–22.\nSarıyer, G., Ataman, M. G., & Kızılo˘ glu, ˙I. (2020). 
Analyzing main and interaction effects of length of stay\ndeterminants in emergency departments. International Journal of Health Policy and Management, 9 (5),\n198–205.\nSözen, M. E., Sarıyer, G., & Ataman, M. G. (2022). Big data analytics and COVID-19: Investigating the\nrelationship between government policies and cases in Poland, Turkey, and South Korea. Health Policy\nand Planning, 37 (1), 100–111.\nSharma, M., Luthra, S., Joshi, S., & Kumar, A. (2020). Developing a framework for enhancing survivability\nof sustainable supply chains during and post-COVID-19 pandemic. International Journal of Logistics\nResearch and Applications, 25 (4–5), 433–453.\nRubbio, I., Bruccoleri, M., Pietrosi, A., & Ragonese, B. (2020). Digital health technology enhances resilient\nbehaviour: Evidence from the ward. International Journal of Operations and Production Management,\n40(1), 34–67.\nTeece, D. J., Pisano, G., & Shuen, A. (1997). Dynamic capabilities and strategic management. Strategic\nManagement Journal, 18\n(7), 509–533.\nTeece, D., Peteraf, M., & Leih, S. (2016). Dynamic capabilities and organizational agility: Risk, uncertainty,\nand strategy in the innovation economy. California Management Review, 58 (4), 13–35.\nVerma, S., & Gustafsson, A. (2020). Investigating the emerging COVID-19 research trends in the field of busi-\nness and management: A bibliometric analysis approach. Journal of Business Research, 118 , 253–261.\n123\nAnnals of Operations Research (2023) 328:1073–1103 1103\nWamba, S. F., Gunasekaran, A., Akter, S., Ren, S. J. F., Dubey, R., & Childe, S. J. (2017). Big data analytics\nand firm performance: Effects of dynamic capabilities. Journal of Business Research, 70 , 356–365.\nWhitt, W., & Zhang, X. (2019). Forecasting arrivals and occupancy levels in an emergency department.\nOperations Research for Health Care, 21 , 1–18.\nWieczorek, M., Siłka, J., & Wo´ zniak, M. (2020). Neural network powered COVID-19 spread forecasting\nmodel. 
Chaos, Solitons & Fractals, 140 , 110203.\nYu, W., Zhao, G., Liu, Q., & Song, Y . (2021). Role of big data analytics capability in developing integrated\nhospital supply chains and operational flexibility: An organizational information processing theory per-\nspective. T echnological F orecasting and Social Change, 163 , 120417.\nZollo, M., & Winter, S. G. (2002). Deliberate learning and the evolution of dynamic capabilities. Organization\nScience, 13 (3), 339–351.\nPublisher’s Note Springer Nature remains neutral with regard to jurisdictional claims in published maps and\ninstitutional affiliations.\nSpringer Nature or its licensor holds exclusive rights to this article under a publishing agreement with the\nauthor(s) or other rightsholder(s); author self-archiving of the accepted manuscript version of this article issolely governed by the terms of such publishing agreement and applicable law.\n123",
"metadata": {
"filename": "Big data analytics 2022.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\Big data analytics 2022.pdf",
- "file_size": 2950376,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:35.837648",
- "content_length": 86402
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\Big data analytics 2022.pdf",
+ "size": 2950376,
+ "source": "docs_to_import"
+ },
+ "id": "2bca5cca-f44c-4503-bbd0-551892538300"
},
- "932927b3-d9fe-4477-9ec0-bbf37f794ab3": {
- "id": "932927b3-d9fe-4477-9ec0-bbf37f794ab3",
- "content": "[Página 1]\nExpert Systems With Applications 115 (2019) 543–556 \nContents lists available at ScienceDirect \nExpert Systems With Applications \njournal homepage: www.elsevier.com/locate/eswa \nBIGOWL: Knowledge centered Big Data analytics /p82 \nCristóbal Barba-González, José García-Nieto ∗, María del Mar Roldán-García, \nIsmael Navas-Delgado, Antonio J. Nebro, José F. Aldana-Montes \nDepartmento de Lenguajes y Ciencias de la Computación, University of Málaga, ETSI Informática, Campus de Teatinos, Málaga 29071, Spain \na r t i c l e i n f o \nArticle history: \nReceived 5 April 2018 \nRevised 26 July 2018 \nAccepted 14 August 2018 \nAvailable online 23 August 2018 \nKeywords: \nOntology \nBig Data analytics \nSemantics \nKnowledge extraction a b s t r a c t \nKnowledge extraction and incorporation is currently considered to be beneficial for efficient Big Data an- \nalytics. Knowledge can take part in workflow design, constraint definition, parameter selection and con- \nfiguration, human interactive and decision-making strategies. This paper proposes BIGOWL, an ontology \nto support knowledge management in Big Data analytics. BIGOWL is designed to cover a wide vocab- \nulary of terms concerning Big Data analytics workflows, including their components and how they are \nconnected, from data sources to the analytics visualization. It also takes into consideration aspects such \nas parameters, restrictions and formats. This ontology defines not only the taxonomic relationships be- \ntween the different concepts, but also instances representing specific individuals to guide the users in \nthe design of Big Data analytics workflows. For testing purposes, two case studies are developed, which \nconsists in: first, real-world streaming processing with Spark of traffic Open Data, for route optimization \nin urban environment of New York city; and second, data mining classification of an academic dataset on \nlocal/cloud platforms. 
The analytics workflows resulting from the BIGOWL semantic model are validated \nand successfully evaluated. \n©2 0 1 8 Elsevier Ltd. All rights reserved. \n1. Introduction \nIn accordance with the recent Gartner’s report, 1 an emerging \nchallenge in Big Data is to construct data-driven intelligent appli- \ncations that capture and inject domain knowledge in the analyt- \nical processes, including context and using a standardized format. \nContext refers to all the relevant (meta)-information to support the \nanalysis and to help interpreting its results. This will facilitate the \nintegration (in a standardized way) with third parties’ data, algo- \nrithms, business intelligence (BI) and visualization services. \nThe use of semantics as contextual information will enhance \nthe analytical power of the algorithms, as well as the reuse of \nsingle components in data analytics workflows ( Ristoski & Paul- \n/p82 This work has been partially funded by Grants TIN2014-58304, TIN2017-86049- \nR (Spanish Ministry of Education and Science) and P12-TIC-1519 (Plan Andaluz de \nInvestigación, Desarrollo e Innovación). Cristóbal Barba-González is supported by \nGrant BES-2015-072209 (Spanish Ministry of Economy and Competitiveness). José\nGarcía-Nieto is the recipient of a Post-Doctoral fellowship of “Captación de Talento \npara la Investigación” Plan Propio at Universidad de Málaga. \n∗Corresponding author. \nE-mail addresses: cbarba@lcc.uma.es (C. Barba-González), jnieto@lcc.uma.es \n(J. García-Nieto), mmar@lcc.uma.es (M.d.M. Roldán-García), ismael@lcc.uma.es (I. \nNavas-Delgado), antonio@lcc.uma.es (A.J. Nebro), jfam@lcc.uma.es (J.F. Aldana- \nMontes). \n1 https://www.gartner.com/doc/3656517/adopt-datadriven-approach- \nconsolidating-infrastructure . heim, 2016 ). Therefore, the development of ways to make the do- \nmain knowledge explicit and usable is needed to improve the \ndata processing and analysis tasks. 
The Semantic Web technolo- \ngies can be used to annotate not only the knowledge domain \nof the data, but also the analytics’ meta-data ( Keet, Ławrynow- \nicz, d’Amato, Kalousis, Nguyen, Palma, Stevens, & Hilario, 2015 ), \nincluding: algorithms’ parameters, input variables, tuning experi- \nences, expected behaviors and taxonomies. This will facilitate the \nreuse and composition of Big Data analytics in a proper manner, as \nwell as to enhance the quality of consumed and produced data. \nIn this regard, ontologies describe concepts, relationships, \nclasses, individuals, formal logic axioms and objects of a particu- \nlar domain ( Gruber, 1995 ). The objects refer to entities and events \n(concepts) in the real world, and their relations represent the se- \nmantic links between these entities. A series of studies have been \nappearing in the last few years, in which ontological approaches \nare suggested to enhance Big Data analytics ( Konys, 2016; Kuiler, \n2014 ). However, they are presented as conceptual frameworks, still \nin an early stage of development, and mostly oriented to the spe- \ncific domain of health system applications. \nThis motivates us to propose an ontology-driven approach to \nsupport knowledge management in Big Data analytics workflows. \nThe proposed ontology is called BIGOWL (BIG data analytics OWL 2 \n2 OWL refers to the Web Ontology Language described in Section 2.1 . \nhttps://doi.org/10.1016/j.eswa.2018.08.026 \n0957-4174/© 2018 Elsevier Ltd. All rights reserved.\n\n[Página 2]\n544 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nontology), which acts as a formal schema for the representation \nand consolidation of knowledge in Big Data analytics. Knowledge \nincorporation is in turn beneficial for an efficient algorithmic per- \nformance, by taking part in operator’s design, parameter selection, \nhuman interactive and decision-making strategies. 
\nOur scientific hypothesis is as follows: “The semantic annotation \nof Big Data sources, components and algorithms can acts as a link to \ncapture and incorporate the domain knowledge to guide and enhance \nthe analytical processes ”. In addition, the semantic annotation can \nprovide the background for reasoning methods based on axiomatic \nand rule logic recommendations. \nTo test this hypothesis, a semantic model has been gener- \nated, which comprises an RDF 3 (Resource Description Framework) \nrepository that follows the BIGOWL scheme. This repository can be \nqueried by high level algorithms using SPARQL. The goal is to prop- \nerly feed artificial intelligence procedures capable of guiding the \ndesign of Big Data analytics workflows. \nAs a proof-of-concept, we show how BIGOWL can be used to \nguide the design of real-world and academic analytic workflows. \nA first case study consists in optimizing vehicular routes based on \nNew York real-time Open Data about urban traffic (average speeds \nof vehicles, traffic densities, etc.). 4 The data source is managed by \nstreaming processing tasks (Kafka and Spark), after which they are \noptimized (jMetalSP 5 ) and visualized. The second case study is a \nclassification workflow modeled by using the popular Weka 6 li- \nbrary for data mining, as well as the BigML in-cloud service. 7 \nThe main contributions of this study are: \n•The proposed ontology, BIGOWL, has been designed and imple- \nmented for the representation and consolidation of knowledge \nin Big Data analytics. It considers a large and complemented set \nof concepts, attributes and relationships that have been taken \nfrom Big Data ecosystem. \n•A semantic approach has been implemented to annotate (i.e. \nto “semantize”) all the involved meta-data from multiple data \nsources, processing components and analytic algorithms. The \nmeta-data are integrated following the BIGOWL structure and \nstored in a common RDF repository. 
\n•The semantic model is evaluated in the context of two realis- \ntic use cases: real-time routing calculation in urban traffic and \nclassical classification with decision trees. The proof-of-concept \nlead us to test our initial hypothesis. \nThe remaining of this paper is structured as follows. In \nSection 2 , background concepts and literature overview are pre- \nsented. Section 3 presents current practices in Big Data analyt- \nics. Section 4 describes the semantic model, comprising the on- \ntology, RDF repository, mappings and workflow composition assis- \ntant. Section 5 presents the use case for testing and validation. In \nSection 6 , a series of discussions are included. Conclusions and fu- \nture work are drawn in Section 7 . \n2. Background and related work \nTo make this paper self-contained, this section describes back- \nground concepts in the Semantic Web field. A review of the state \nof the art is also provided to point out the main differences of the \nrelated works with the proposed approach. \n3 RDF in W3C https://www.w3.org/RDF/ . \n4 https://www.data.cityofnewyork.us/Transportation/Real- Time- Traffic- Speed- Data/ \nxsat-x5sa . \n5 http://www.jmetal.sourceforge.net/ . \n6 https://www.cs.waikato.ac.nz/ml/weka/ . \n7 https://www.bigml.com/ . Table 1 \nBasic OWL-DL semantic syntax used to formally define the proposed \nontology. \nDescriptions Abstract syntax DL syntax \nOperators intersection ( C 1 , C 2 , /22c5/22c5/22c5, C n ) C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \nunion ( C 1 , C 2 , /22c5/22c5/22c5, C n ) C 1 /2294C 2 /2294/22c5/22c5/22c5/2293C n \nRestrictions for at least 1 value V from C ∃ V.C \nfor all values V from C ∀ V.C \nR is Symmetric R ≡R −\nClass Axioms A partial ( C 1 , C 2 , /22c5/22c5/22c5, C n ) A /subsetsqequal C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \nA complete ( C 1 , C 2 , /22c5/22c5/22c5, C n ) A ≡C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \n2.1. Background concepts \n•Ontology. 
In accordance with Noy, McGuinness et al. (2001) , an \nontology provides a formal representation of the real world. \nIt defines an explicit description of concepts in a domain of \ndiscourse (classes or concepts), properties of each concept de- \nscribing various features and attributes of the concept (proper- \nties) and restrictions on properties. Ontologies are part of the \nW3C standard stack of the Semantic Web. 8 An ontology to- \ngether with a set of individual instances of classes constitutes a \nknowledge base and offer services to facilitate interoperability \nacross multiple heterogeneous systems and databases. \n•RDF. Resource Description Framework ( McBride, 2004 ) is a \nW3C recommendation that defines a language for describ- \ning resources on the web. RDF describes resources in terms \nof triples, consisting of a subject, predicate and object. RDF \nSchema (RDFS) ( Staab & Studer, 2013 ) describes vocabularies \nused in RDF descriptions. \n•OWL. The Ontology Web Language is used to define ontolo- \ngies on the Web, which extends RDF and RDFS, but adding a \nvocabulary. From a formal description, OWL is equivalent to a \nvery expressive description logic DL, where an ontology cor- \nresponds to a Tbox ( Gruber et al., 1993 ). In this sense, OWL- \nDL is syntactic description that gives maximum expressive- \nness while retaining computational completeness and decid- \nability ( McGuinness, Van Harmelen et al., 2004 ). In this work, \nwe use OWL-DL syntax summarized in Table 1 to formalize the \nproposed ontology. \n•SPARQL is a query language for easy access to RDF \nstores. It is the query language recommended by \nW3C ( Harris, Seaborne, & Prud’hommeaux, 2013 ) to work \nwith RDF graphs ( Prud, Seaborne et al., 2006 ), then supporting \nqueries and web data sources identified by URIs. \n•SWRL. 
The Semantic Web Rule Language provides the \nOWL-based ontologies with procedural knowledge, which \ncompensates for some of the limitations of ontology in- \nference, particularly in identifying semantic relationships \nbetween individuals ( Horrocks, Patel-Schneider, Bechhofer, \n& Tsarkov, 2005 ). SWRL uses the typical logic expres- \nsion “Antecedent ⇒ Consequent ”t o represent semantic rules. \nBoth antecedent (rule body) and consequent (rule head) \ncan be conjunctions of one or more atoms written as \n“atom 1 ∧ atom 2 ∧ /22c5/22c5/22c5∧ atom n ”. Each atom is attached to one or \nmore parameters represented by a question mark and a vari- \nable (e.g., ? x ). The most common uses of SWRL include trans- \nferring characteristics and inferring the existence of new indi- \nviduals ( Grosof & Poon, 2004 ). 9 \n8 https://www.w3.org/standards/semanticweb/ . \n9 https://www.w3.org/Submission/SWRL/ .\n\n[Página 3]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 545 \n2.2. Related work \nIn the last decade, there have been appearing a series of stud- \nies in which ontological approaches are defined to express the \nknowledge domain in data mining and optimization algorithms. A \nrepresentative set of these works are compiled in a recent sur- \nvey ( Dou, Wang, & Liu, 2015 ), in which they are organized by \ncategories of algorithms and applications: association rule discov- \nery ( Marinica & Guillet, 2010 ), classification ( Allahyari, Kochut, & \nJanik, 2014 ) and clustering ( Jing, Ng, & Huang, 2010 ). In these ap- \nplications, semantics is used with different objectives, such as: to \nreduce the search space by specifying restrictions, to filter results \nin the post-processing stage, and to annotate the results of data \nmining processes. \nFollowing with this research line, some recent works include \nontologies to guide the processes in machine learning tasks. 
For \nexample, in Pinto, Scioscia, Loseto, and Ruta (2015) and Roldán- \nGarcía, García-Nieto, and Aldana-Montes (2017) , two different on- \ntologies are used in the classification process to infer incon- \nsistencies between concepts by means of semantic reasoning. \nIn Phan, Dou, Wang, Kil, and Piniewski (2015) , an ontology-driven \ndeep learning model is proposed to predict human behavior. \nIn the field of optimization, an interesting approach has been \nrecently proposed in Yaman, Hallawa, Coler, and Iacca (2017) , \nwhere the ECO ontology is defined to formally represent knowl- \nedge in evolutionary computation algorithms. This ontology can \nbe used for suggesting strategies for solving optimization prob- \nlems. At the same time, an OWL ontology has been pro- \nposed in Li, Yevseyeva, Basto-Fernandes, Trautmann, Jing, and \nEmmerich (2017) to model and systematize the knowledge of \npreference-based multi-objective evolutionary algorithms. These \nontologies are validated in use cases focused on algorithmic and \nparameter selection in academic problems. \nFrom a different point of view, a parallel line of research focuses \non defining ontologies for the semantic annotation of data analytic \nworkflows. The main objective is to model the input and output \nof algorithms involved in data mining and knowledge base discov- \nery (KDD) workflows to generate valid compositions. To this end, \nseveral OWL ontologies such as: KDDONTO ( Diamantini, Potena, & \nStorti ), DMWF ( Kietz, Serban, Bernstein, & Fischer, 2010 ) and KD \n( Záková, Kremen, Zelezny, & Lavrac, 2011 ), were proposed. How- \never, they did not describe the problem domain, or those basic \nconcepts (algorithm, type of analysis, task, dataset, attribute, etc.) \nthat can be combined to define entities or constraints. 
In fact, \nthese ontologies were not designed with the objective of opti- \nmizing the performance of the data mining algorithms, since they \ndo not offer detail enough to provide support to what is known \nas meta-learning. In Nguyen, Hilario, and Kalousis (2014) , meta- \nlearning is defined as the KDD procedure to improve performance \nin data mining processes, using information collected during the \nexperimentation phase of these algorithms. In this regard, the use \nof semantics is considered not only for the algorithmic composi- \ntion, but also for the improvement of data mining processes, taking \nadvantage of acquired knowledge from past experience. \nIn this context, the EU-FP7 European initiative e-LICO 10 pro- \nposed the DMOP ontology ( Keet et al., 2015 ), which is de- \nfined to support the analytic workflow composition by follow- \ning the standard CRISP-DM ( Shearer, 20 0 0 ). DMOP is used to de- \nfine analytical workflows, as well as to describe algorithms, pa- \nrameters, inputs/outputs and a large amount of meta-data in- \ncluded in typical data mining processes. A step further was taken \nby Kumara, Paik, Zhang, Siriweera, and Koswatte (2015) that use \n10 http://www.e-lico.eu/ . Automatic Service Composition to automate the analytic workflow \ngeneration. \nAs a summary, Table 2 outlines the main features of the related \nwork with regards to the semantic approach proposed here. These \nfeatures consist of specifying whether the existing approaches: fo- \ncus on data mining or optimization, are oriented to Big Data, pro- \nvide proof-of-concepts, align with other ontologies, use OWL/RDF \nin the semantic model and/or describe workflow composition \ntasks. Then, it is possible to identify the actual contributions of the \nproposed semantic model beyond the state of the art, as follows: \n•BIGOWL is conceived to semantically model data analytics in \nBig Data environments. 
Similarly to other ontologies in the \nliterature, it is oriented to general KDD procedures, although \nconsidering those Big Data ecosystem elements with class in- \nstances, e.g., ontology individuals. \n•It is aligned with the DMOP ontology, which is in turn aligned \nwith CRISP-DM. They have been validated to construct data \nmining workflows. \n•Besides data mining, BIGOWL is also focused on optimization \nalgorithms, although with special interest on covering multi- \nobjective metaheuristics in Big Data environments. \n•The proposed approach is validated on two real-world use- \ncases consisting of classical data mining and streaming data \nprocessing for multi-objective optimization. \n3. Current practices in Big Data analytics \nIn current Big Data technology ecosystems, when facing a spe- \ncific data analytic task, it is usual to support on already existing \ntools. Some of those consist in commercial services often provided \nthrough cloud computing Software-as-a-Service (SaaS), which can \nbe used by no skilled people by means of workflow compositions \n(e.g., Azure ML, Amazon ML, BigML, Data Mining Cloud Frame- \nwork, and Kognitio); other tools are open-source frameworks re- \nquiring skilled users who prefer to program their application using \nmore technical approaches. Additional factors (such as: data for- \nmat, data source, volume and velocity required to analyse data) are \nalso determinant when choosing the proper technology ( Zomaya & \nSakr, 2017 ). Hadoop ecosystem represents the most used frame- \nwork for developing distributed Big Data analytic applications. \nHowever, it is conceived for high skilled users, so even the stan- \ndard workflow composition service of Hadoop (Oozie) requires cer- \ntain programming ability to be properly used. 
\nBesides technological or commercial aspects, current Big Data \nplatforms still follow the common procedure when facing data an- \nalytics tasks ( ACM-SIGKDD, 2014 ), which comprises typical steps \nof classical KDD: data collection, data transformation, data mining, \npattern evaluation, and knowledge presentation. \nKeeping this in mind, the proposed semantic approach is ori- \nented to general KDD procedures, then leading the underlying \nBig Data technological platform to be semantically annotated with \nclass instances, e.g., individuals in the ontology. \n4. Semantic model \nOne of the main goals in this study is to capture all the needed \nsemantics to guide the smart design of Big Data analytics work- \nflows and to enhance their performance. For this reason, we opted \nto design an OWL 2 ontology to describe analytic algorithms, \ndatasets, problems, and workflows in the Big Data context. \nTo this end, the standard Ontology 101 development pro- \ncess ( Noy & McGuinness, 2001 ) has been followed, which com- \nprises seven steps: \n1. Determine the domain and scope of the ontology . The main scope \nof BIGOWL is data processing and data analytics in Big Data en-\n\n[Página 4]\n546 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 2 \nSummary ontologies’ features. \nFeature/Ontology CRISP-DM KDDONTO PMOEA ECO (Pinto’2015) (Phan’2015) DMWF KD DMOP BIGOWL \nData Mining /check /check /check /check /check /check /check /check \nOptimization /check /check /check \nBig Data environments /check \nProof of concepts /check /check /check /check /check /check /check /check \nAligned to other ontology /check /check \nOWL/RDF /check /check /check /check /check /check /check /check \nWorkflow composition /check /check /check /check \nFig. 1. Overview of the BIGOWL ontology. Continuous arrows refer to subclasses, whereas dotted ones refer to properties. \nvironments. 
This considers not only classical data analytic pro- \ncedures, but also specific data processing and underlying soft- \nware platform features oriented to Big Data. \n2. Consider reusing existing ontologies . As commented before, the \nproposed ontology is aligned with DMOP, which has been \nsuccessfully validated to construct data mining workflows. \nDMOP is in turn aligned with the foundational ontology \nDOLCE ( Masolo, Borgo, Gangemi, Guarino, & Oltramari, 2003 ) \nand follows the standard CRISP-DM in the definition of data \nmining processes. \n3. Enumerate important terms in the ontology . Important terms \nwere selected from the literature related to Big Data and op- \ntimization. In addition, terms from the ontologies aligned ( Keet \net al., 2015; Yaman et al., 2017 ) were also incorporated. Exam- \nples of such terms are: Component, Workflow, Task, Data, Dat- \naProcessing and Software . \n4. Define the classes and the class hierarchy . We have followed a \ntop-down approach in developing the class hierarchy. This fact \nfacilitates among others, the alignment with DMOP and DOLCE, \nthe design of annotation mappings and the use of a seman- \ntic reasoner. Fig. 1 shows the ontology core classes and hier- \narchy. For instance, the class Component has several subclasses, \nincluding DataAnalysing and DataCollection . Classes modeling al- \ngorithms, components and workflows are aligned with the class \ndmop:DataType . BIGOWL has been developed using Protégé11 \nand OWL 2. \n11 https://protege.stanford.edu/ . 5. Define the properties of classes and slots . With the purpose of \nrelating classes and defining attributes, we have included ob- \nject and data properties. A representative set of properties are \nshown in Table 3 , where the class Component is related to class \nAlgorithm by means of the object property hasAlgorithm . Data \nproperties of class Component are path, author, numberOfInputs \nand numberOfOutputs . \n6. 
Define the facets of the slots . This step includes the definition of \ncardinality constraints and value restrictions for the ontology’s \nproperties. For example, the range of the property order is re- \nstricted to integer (to specify in which step this task is carried \nout), when the class Task is its domain. \n7. Create instances . Instances or individuals in BIGOWL are \nspecific of the Big Data analytics domain. For exam- \nple, GeneratorDataTraffic is an instance of the class Kafka , \nwhich is a subclass of DataIngestion . The class Kafka has a \nproperty topicKafka (with range “string”) to indicate streams of \nrecords of Apache Kafka 12 services. \n4.1. The BIGOWL ontology \nBIGOWL has been developed following the steps described \nabove, producing 184 classes, 16 object properties (binary re- \nlationships between individuals), 20 data properties (individ- \nual attributes), 488 axioms, 66 individuals and growing. It is \nworth mentioning that classes DM-DataClass ≡DMDataClass and IO- \n12 Data Streaming Processing https://www.kafka.apache.org/ .\n\n[Página 5]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 547 \nTable 3 \nComponent: object and data properties. 
\nObject properties Description logic \nhasAlgorithm ∃ hasAlgorithm.Thing /subsetsqequal Component \nhasParameter ∃ hasParameter.Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nisConnected ∃ isConnected.Thing /subsetsqequal Algorithm /2294Component /2294Task \nisCorrect ∃ isCorrect.Thing /subsetsqequal Algorithm /2294Component \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem \nnumberOfInputs ∃ numberOfInputs.Datatype Literal /subsetsqequal Algorithm /2294Component \nnumberOfOutputs ∃ numberOfOutputs.Datatype Literal /subsetsqequal Algorithm /2294Component \npath ∃ path.Datatype Literal /subsetsqequal IO-Class /2294Algorithm /2294Component \nTable 4 \nTask: object and data properties. \nObject properties Description logic \ncompatibleWith ∃ compatibleWith.Thing /subsetsqequal Task /latticetop /subsetsqequal ∀ compatibleWith.Task \nhasComponent /latticetop /subsetsqequal ∀ hasComponent.Component \nisConnected ∃ isConnected.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nData Properties Description Logic \norder ∃ order.Datatype Literal /subsetsqequal Task /latticetop /subsetsqequal ∀ order.Datatype \nClass ≡Data are declared as equivalent (with relation ≡) to align \nwith those classes from other ontologies (DMOP) that describe \nsimilar concepts. 
We use OWL-DL syntax (see Table 1 ) to formal- \nize the proposed ontology. The complete ontology is developed in \n“bigowl.owl ”fi l e and available in the GitHub repository. 13 \nA representative set of the main classes are described here, to- \ngether with their object and data properties. These classes are: \nComponent, Task, Algorithm, Data , and Workflow . Each class has de- \nfined a set of properties or conditions in order to be conceptual- \nized. That is, an individual that satisfies those properties is consid- \nered to be a member of that class. \n- Component . This class represents each processing step in the \nanalytic workflow. It is used to encapsulate one concrete function- \nality, its parameters and the corresponding inputs and outputs it \nconsiders. The class Component has four subclasses that are ori- \nented to define specific functionalities in typical data analytics pro- \ncessing chains: DataCollection , to connect to data sources; DataPro- \ncessing , to clean, curate, fuse and consolidate data; DataAnalysis , to \nperform the algorithmic function; and DataSink , to represent final \nsteps in the data flow, e.g., store and visualization. Table 3 con- \ntains the object and data properties defined for Component . In ac- \ncordance with these, a component can specify Input classes and \nOutput classes, to define the type of data it is accepting and gener- \nating, respectively. Therefore, a component can connect with other \none if their linking inputs and outputs are compatible among them. \n- Task . A task represents an instance of a component that is \nused in a workflow and can be run. As shown in Table 4 , the class \nTask has similar properties to those of Component , but including \nthe object property compatibleWith , to specify compatibility among \nconnected tasks, and the data property order , which indicates the \nspecific step of execution in which this task is scheduled, in the \nscope of the workflow. 
A Component is then a template for one or \nmore tasks, which will be used to carry out its specific functional- \nity in a workflow. \n13 URL link https://www.github.com/KhaosResearch/BIGOWL . - Algorithm . This class is devoted to cover all possible kinds of algorithms. It \nhas two main subclasses: DataMiningAlgorithm and OptimizationAl- \ngorithm ; which are used to distinguish between these two fami- \nlies of algorithms. The former one is included in form of equiv- \nalence with the class DM-Algorithm , which is linked from DMOP. \nThis way, all subclasses deriving from this class in DMOP are also \nused in BIGOWL. For the latter, i.e., OptimizationAlgorithm , a new hi- \nerarchical classification of classes has been elaborated in this study \nfor the annotation of this family, which comprises: Exact, Heuristic , \nand Metaheuristic algorithms as main subclasses. \nTable 5 includes the object and data properties of Algorithm . \nAmong its main object properties it is worth mentioning: imple- \nments , which refers to a learning model or search strategy; \nmanages , to annotate the type of data it works with; and resolves , which \nis related to the Problem it is oriented to solve. This is a use- \nful mechanism to relate classes Algorithm and Problem , which also \nshare the data property dealWith that indicates the specific fea- \ntures an algorithm should fulfill to deal with a problem. \nIn this regard, the class Problem defines a series of data proper- \nties like: numberOfConstraints, numberOfObjectives, encodedBy , and \nnumberOfVariables , that will lead a future reasoner to recommend \nthe correct algorithm to solve it. These two classes have to be \ndeclared as DisjointWith , in order to avoid future inconsistencies \nwhen querying the annotated data in a workflow. \n- Data . The class Data is devoted to annotate all the data flow- \ning throughout the analytic workflow. It is declared as EquivalentTo \nIO-Class of DMOP. 
This aligning enables datatypes defined by third \nparties’ ontologies to be contextualized in the analysis. Table 6 \ncontains the main data properties defined for this class, namely: \npath , to annotate the origin of data; and hasDataType , which de- \nfines the relation with class DataType . This last is used to define \nthe type of data, i.e. PrimitiveType (Double, Integer, Boolean, etc.) \nor StructuredType (Graph, Tree, Matrix, Vector, Tuple, etc.). \n- Workflow . It is used to guide the correct orchestration of \nthose tasks involved in a data analysis job. Its main object prop- \nerties are hasTask and hasParameter , which are formally described \nin Table 7 . These properties are used by the workflow to obtain the \nexecution order, as well as the input/output specifications of each\n\n[Página 6]\n548 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 5 \nAlgorithm: object and data properties. \nObject properties Description logic \nhasComponent /latticetop /subsetsqequal ∀ hasComponent.Component \nhasParameter ∃ hasParameter.Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nimplements Transitive Property implements ∃ implements.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ implements.Strategy \nmanages ∃ manages.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ manages.DataType \nresolves ∃ resolves.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ resolves.Problem \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem 
\nnumberOfInputs ∃ numberOfInputs.Datatype Literal /subsetsqequal Algorithm /2294Component \nnumberOfOutputs ∃ numberOfOutputs.Datatype Literal /subsetsqequal Algorithm /2294Component \ndealWith ∃ dealWith.Datatype Literal /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ dealWith.Datatype \nTable 6 \nData: object and data properties. \nObject properties Description logic \nhasDataType ∃ hasDataType.Thing /subsetsqequal Parameter /2294Data /latticetop /subsetsqequal ∀ hasDataType.DataType \npath ∃ path.Datatype Literal /subsetsqequal IO-Class /2294Algorithm /2294Component \nTable 7 \nWorkflow: object and data properties. \nObject properties Description logic \nhasTask ∃ hasTask.Thing /subsetsqequal Workflow /latticetop /subsetsqequal ∀ hasTask.Task \nhasParameter ∃ hasParameter Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem \nisCorrectWorkflow ∃ isCorrectWorkflow.Datatype Literal /subsetsqequal Workflow /latticetop /subsetsqequal ∀ isCorrectWorkflow.Datatype \nnumTasks ∃ numTask.Datatype /subsetsqequal Workflow /latticetop /subsetsqequal ∀ numTask.Datatype \ntask. This information, together with the data properties numTasks \nand isCorrectWorkflow , is then used in reasoning time to check \nwhether the workflow is correctly composed or not, i.e., to address \nsemantic validation of the analytic workflow. \n4.2. Overall approach \nAn overview of the proposed semantic model is illustrated in \nFig. 2 , which is arranged together with the underlying operational \nmodel, hence enabling actual composition of analytic workflows. \nIn this approach, BIGOWL is the ontological scheme driving the \nwhole process. 
It is the terminological box (TBox) that defines the \nvocabulary with concepts and properties in the domain of Big Data \nanalysis. As explained before, BIGOWL is developed in OWL 2 ac- \ncording to which, concepts are represented by classes and relations \nare represented by data properties or object properties. As repre- \nsented in Fig. 2 , BIGOWL is conceived as an abstract top-level on- \ntology that enables not only subontology replication e.g., to focus \non specific use cases or algorithmic families, but also linkage with \nexternal domain knowledge ontologies, which are oriented to the \nspecific problem domain (Smart Cities, Biology, etc.). \nAt bottom-level, the Assertional Box (ABox) defines all the in- \nstances in the knowledge domain (in OWL 2 an instance is rep- \nresented by an individual) involving the analytic workflows’ meta- \ndata. These instances are stored in RDF triple format in a Stardog 14 \nrepository, which is a commercial version of the Pellet OWL 2 rea- \nsoner ( Sirin, Parsia, Grau, Kalyanpur, & Katz, 2007 ), but enhanced \nwith persistence capabilities. Once the ontology (Tbox) has been \n14 http://www.stardog.com/ . loaded together with SWRL rules, a series of reasoning tasks are \nlaunched by using the Stardog OWL 2 reasoner to derive new infor- \nmation that is not explicitly expressed in the knowledge base. The \nnew information will indicate, when applicable and among others, \nwhether an analytic workflow is correctly composed, or not. \nIn this model, the Annotation Module is used to populate the \nRDF repository with new instances that involve the required meta- \ndata (annotated) to be used in workflows, for example: algorithms, \noperators, parameters, input/output (paths), data sources, database \nconnections, data sinks, software, execution order, etc. \nThe Operational Model will make use of these annotated meta- \ndata for driving the workflow composition. 
In this process, each \nstep a new component is to be selected and used, a SPARQL query \nis launched to obtain the required meta-data and to suggest the \nnext possible component/s to be included. \nA very simple (hypothetical) case of use would comprise the \nfollowing steps: \n(i) A user desires to extract patterns from a dataset and visual- \nize the results; \n(ii) Then, the user selects one algorithm from a list of data \nmining algorithms (in form of analysis component) queried \nthroughout the semantic model; \n(iii) The selected algorithm requires specific input parameters \nand data to train, so the semantic model will supply them; \n(iv) The initial dataset should be then formatted in form of data \ncollection task; \n(v) In case collected data need transformation, an intermediate \ndata processing component is included between collection \nand analysis;\n\n[Página 7]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 549 \nFig. 2. General overview of the semantic model that follows the ontology’s scheme of BIGOWL. The analytic operational model address the workflow composition driven by \nthe semantic model \n(vi) The semantic model will suggest suitable output component \n(visualization) to be linked after the analytic algorithm. \nIt is worth mentioning that each step in the workflow is instan- \ntiated by a task, which entails an execution order. Then, the entire \nworkflow is arranged according to all the ordering values in tasks. \nIn summary, the semantic model acts as a mediator between \ndata provider components and data consumers. It also acts as a \ndata source and meta-data registry with functions to make “agree- \nments” on the provision and traceability of the whole data value \nchain. \n5. Validation \nFor validation purposes, two different cases of study have been \ndeveloped to show how the proposed semantic approach is used \nfor driving the composition of data analytic workflows. 
The first \none is focused on Big Data streaming processing and optimiza- \ntion of real-world traffic routes in the domain of Smart Cities. The \nsecond case study is centered on classic data mining analysis on \nacademic problem instances, although considering local and cloud \ncomputing environments. In this way, we aim at covering, as much \nas possible, different aspects in Big Data applications: algorithmic \nanalyses (optimization and data mining), velocity and volume is- \nsues (streaming processing), real-world and academic data prob- \nlems, and Big Data ecosystems (Apache Spark local and on-premise \ncluster, BigML cloud SaaS API). \nIn these two cases, a similar semantic annotation and query- \ning procedure has been followed, which consists in the man- \nual annotation (guided by domain experts) of: algorithms, tech- \nnological/platform features, and attributes of problem domain of \nknowledge; and automatic querying by means of SPARQL sen- \ntences. To distinguish individuals belonging to each case study, \ntwo different namespaces have been defined, i.e. traffic: http:// \nwww.khaos.uma.es/perception/traffic/khaosteam# and weka: http: \n//www.khaos.uma.es/perception/weka/khaosteam# , respectively. 5.1. Case study 1: streaming processing of New York City traffic \nopen-data \nThe first case study consists in a dynamic version of the \nbi-objective Traveling Salesman Problem (TSP), to minimize the \n“travel time” and the “distance” to cover certain routing points \nin an urban area. The algorithm for solving it is a dynamic variant \nof the well-known multi-objective metaheuristic NSGA-II provided \nin jMetalSP ( Barba-González, García-Nieto, Nebro, Cordero, Durillo, \nNavas-Delgado, & Aldana-Montes, 2017 ), 15 which allows parallel \nprocessing of evaluation functions in an Apache Spark environment. 
\nIn the case of the dynamic bi-objective TSP, which is formu- \nlated in terms of a distance matrix and a time travel matrix, the \nperiodic changes can affect any of them. Our particular dynamic \nTSP problem instance is based on real-world data. Specifically, it \nis fed from the Open Data API provided by the New York City \nDepartment of Transportation, 16 which updates traffic information \nseveral times per minute. The information is provided as a text file \nwhere each line includes the average speed to traverse the two end \npoints defining a link in the most recent interval. The goal is then, \ngiven a list of nodes in New York city and the distances between \neach pair of nodes, calculate the shortest possible route that visits \neach node. \nNew York’s traffic data is read periodically by an external appli- \ncation that writes a file in HDFS whenever new data are acquired, \nso we have implemented a streaming data component for that pur- \npose. This component reads periodically the new data appeared \nin the specific directory (this is done automatically by Spark) and \nmakes a simple processing: if a change in a link is detected (time \nor distance), then the corresponding problem matrices are up- \ndated. \nThe analysis of the streaming data sources can be carried out \nin parallel by using Spark. In fact, we used a Hadoop cluster com- \n15 https://www.github.com/jMetal/jMetalSP . \n16 https://www.data.cityofnewyork.us/Transportation/Real- Time- Traffic- Speed- Data/ \nxsat-x5sa .\n\n[Página 8]\n550 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nFig. 3. Workflow for dynamic bi-objective optimization of TSP problem instance with Open Data New York \nposed of 100 cores in the previous study where the Big Data op- \ntimization model was presented ( Barba-González et al., 2017 ). In \naddition, two other streaming data sources were used as sepa- \nrate components, which are based on Twitter and Kafka. 
In the first \none, tweets are read from Twitter API with the topic “New York \ntraffic” and a processing of each tweet is simulated, so the prob- \nlem is updated in accordance with it (for testing purposes we set \nrandom changes in traffic scenario). This way, we combine a differ- \nent streaming source with the possibility of adjusting the process- \ning time, which will serve for performance evaluation purposes. In \nthe second source, the idea is to enrich the case study with an- \nother data source that will produce artificial data. Then we created \na Kafka message producer that generates, following uniform and \nnormal distributions, a series of random messages with data to up- \ndate the problem. Every 5 s at least 1000 messages are produced, \nbut on average about 10,000 messages are created. Both the Twit- \nter and Kafka streaming source classes have the same behavior as \nthe HDFS based one: they iteratively collect and analyze the data \nto somehow update the problem. \nAfter data processing, the analytic task is then carried out, \nwhich entails dynamic optimization computed by NSGAII algo- \nrithm of the jMetalSP library. The results of the analysis are used \nto feed data sinks. In this case study, we consider two of them: \none that stores the produced Pareto fronts in HDFS, and other one \nthat visualizes information about the Pareto front approximation \n(as the number of solutions and the number of generated fronts) \nusing R-plot library. \nThe workflow implementing this case study is represented in \nFig. 3 , 17 where all the components are arranged according to data \nflow. In this workflow, the numeric indexes (1)–(7) correspond to \nthose steps as indicated in Table 8 , which contain the required \nSPARQL queries the semantic model applies to recommend forth- \ncoming component/s to use, in design time. For this case study, \nthe main set of individuals annotated in the semantic model and \ntheir relationships, are shown in Fig. 4 . 
Then it is possible to follow \nthe complete process step-by-step: \n•Step (1) . The workflow designer fetch all the optimization prob- \nlems from BIGOWL to select the implementation that better \nfits the required model for TSP instances. Interestingly, they are \nall subclasses of OptimizationProblem , which is integrated from \nDMOP. As a result, (s)he selects TSP. \n•Step (2) . Given a problem to solve, TSP in this case, the seman- \ntic model recommends a series of optimization algorithms that \ncould deal with it, i.e., those annotated algorithms that better \n17 Ontology instances available at https://www.github.com/KhaosResearch/ \nBIGOWL/blob/master/traffic.owl . adapt to the problem in terms of properties, such as: solution \nencoding, manages, dealWith , etc. After this, the designer selects \nNSGAII. \n•Step (3) . This is an intermediate step followed by the semantic \nmodel to recommend specific annotated component and task \ninstancing the underlying software that implements TSP and \nNSGAII. \n•Step (4) . Now, the objective of this query is to obtain the spe- \ncific data model to properly host data in problem and algorithm \ntasks. This step is thought to use specific domain knowledge \ninformation (traffic routes in this case) coming from external \nontologies. The resulting annotated instance here is MatrixNY , \nwhich refers to a data model comprising a matrix of points and \ndistances in the scenario of New York city. \n•Step (5) . Once the workflow designer has a clear idea about \nthe data model, (s)he can set data sources and connect them \nto feed the analysis. The semantic model is then queried to \nshow all possible data collectors, i.e., those previously anno- \ntated. Among all the resulting possibilities, ReadWebNYDataTraf- \nfic, DataCollectionDataTrafficKafka and DataCollectionTwitter are \nselected for this case study. \n•Step (6) . 
Before connecting data sources to analytic component, \na previous task is required for data processing and consolida- \ntion. In this case study, the corresponding component is im- \nplemented as a Spark processing task to join Kafka messages, \nTweets and traffic data streams. \n•Step (7) . Last steps usually correspond to data sink tasks to al- \nlocate results from analyses. For this case study, Visualization- \nTask and HDFSStoreTask are selected, which implement R-plot \nvisualization and storage in HDFS, respectively. \n•Step (8) . Finally, the semantic model is queried to obtain \nthe corresponding task instances that are mutually compati- \nble among them. The analytic workflow is now ready to be \nlaunched on the underlying running platform. \nMoreover, once the whole process is completed, a further rea- \nsoning procedure can now be started to check whether the gen- \nerated workflow is semantically consistent, or not. This reasoning \ntask will be explained in Section 5.3 . \n5.2. Case study 2: classification with Iris flower dataset \nAs commented before, the second case study consists in the \nacademic problem of Irish flower classification by means of deci- \nsion tree J48, a classical algorithm for data mining analytics. For \nmaterialization, two different approaches have been used in this \ncase: the well-known library for data mining Weka and the BigML \nSaaS API for analysis on-cloud. The aim is to illustrate how similar \nannotation and querying procedures with BIGOWL can be used to\n\n[Página 9]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 551 \nTable 8 \nSPARQL queries for case study of streaming processing of New York city traffic open-data. 
\nStep SPARQL Result\n(1)SELECT DISTINCT ?problem WHERE {\n?problem rdf:type ?type .\n?type rdfs:subClassOf* dmop:OptimizationProblem .}TSP, ZDT1, ZDT2, ZDT3, ZDT4,\nZDT5, ZDT6, Kursawe..\n(2)SELECT DISTINCT ?algorithm\n(count(DISTINCT ?propertiesAlgorithm) AS numProperties)\nWHERE {\ntraffic:TSP bigowl:encodedBy ?solution.?algorithm rdf:type ?type.?type rdfs:subClassOf* bigowl:OptimizationAlgorithm.\n?entity bigowl:manages ?solution .\n?algorithm bigowl:dealWith ?propertiesAlgorithm .\ntraffic:TSP bigowl:hasFeature ?propertiesTSP .\nFILTER ( ?propertiesTSP in (?propertiesAlgorithm)).\n} GROUP BY ?algorithm ORDER BY DESC(?numProperties)NSGAII, MOCell,\nSMSEMOA,SPEA2, IBEA, PAES,\nPESA2, WASFGA\n(3)SELECT distinct ?comp ?task WHERE {\n?comp bigowl:hasProblem traffic:TSP .\n?comp bigowl:hasAlgorithm traffic:NSGAII .\n?comp rdf:type bigowl:Optimization .?task rdf:type bigowl:Task . ?task bigowl:hasComponent ?comp. }OptmimizationComponent,\nOptimizationTask\n(4)SELECT distinct ?data WHERE {\n?comp bigowl:hasProblem traffic:TSP .\n?comp bigowl:hasAlgorithm traffic:NSGAII .\n?comp rdf:type bigowl:Optimization .\n?task rdf:type bigowl:Task . ?task bigowl:hasComponent ?comp.\n?task bigowl:specifiesInputClass ?data . }MatrixNY\n(5)SELECT distinct ?dataCollection WHERE {\n?dataCollection rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataCollection.}ReadWebNYDataTraffic,\nDataCollectionHDFS,\nDataCollectionDataTrafficKafka,\nDataCollectionTwitter,\nDataCollectionDB, ...\n(6)SELECT distinct ?taskProcessing ?compProcessing WHERE {\n?taskCollection bigowl:hasComponent bigowl:ReadNYDataTraffic.\n?taskCollection bigowl:specifiesOutputClass ?out.\n?dataProcessing rdf:type ?typeProcessing .\n?typeProcessing rdfs:subClassOf* bigowl:DataProcessing.\n?taskProcessing bigowl:hasComponent ?dataProcessing .?taskProcessing bigowl:specifiesInputClass ?out.\n?taskProcessing bigowl:specifiesOutputClass traffic:MatrixNY. 
}SparkTask, ComponentSpark\n(7)SELECT distinct ?dataSink WHERE {\n?dataSink rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataSink.}VisualizationPlot,\nDataSinkHDFSStore,\nDataSinkOracleStore, ...\n(8)SELECT distinct ?task1 ?task2 WHERE {\n?task1 rdf:type bigowl:Task . ?task2 rdf:type bigowl:Task .\n?task1 bigowl:specifiesOutputClass ?output .\n?task2 bigowl:specifiesInputClass ?output . }GeneratorDataTrafficTask,\nSparkTask, TwitterCollectorTask,\nKafkaMGTask,\nReadNYDataTrafficTask,\nOptimizationTask, VisualizationTask\ncompose workflows on different platforms when solving the same \nproblem. \nFig. 5 shows the individuals (and their relationships) anno- \ntated in the ontology, and Fig. 6 18 represents graphically the an- \nalytic workflow for this case study. The numeric labels (1)–(5) are \n18 Ontology instances available at https://www.github.com/KhaosResearch/ \nBIGOWL/blob/master/weka.owl . aligned with their corresponding steps in Table 9 that contain the \nSPARQL queries used and their results. \nIn a nutshell, steps (1)–(3) are used to guide the workflow de- \nsigner on the selection of data model, algorithm, and analysis com- \nponents and tasks, respectively. Step (4) is used to query suit- \nable data collector components, in this case the designer selects \nDataCollectionBigML for BigML API instance and DataCollectorFS for \nWeka instance dataset. Step (5) queries are devoted to select possi- \nble data sink components, and specifically DataSinkFSStore and Vi-\n\n[Página 10]\n552 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nFig. 4. BIGOWL’s individuals annotated in the workflow for dynamic bi-objective optimization of TSP problem \nFig. 5. BIGOWL’s individuals in workflow for Irish flower classification with J48 decision tree instanced from Weka \nFig. 6. Workflow for Irish flower classification with J48 decision tree instanced from \nWeka and BigML. 
sualizationPlot , which implement orders to save results in file sys- \ntem and API method for plotting in BigML, respectively. Finally, \nstep (6) obtains the corresponding task instances that are mutu- \nally compatible among them throughout the complete workflow. \n5.3. Reasoning with BIGOWL \nReasoning procedure is built in BIGOWL with formulation of se- \nmantic rules on top of the OWL ontology, to deduce new informa- \ntion from the existing knowledge. These rules are formulated in \nSWRL and used to perform semantic reasoning jobs mainly de- \nvoted to check correctness of workflows, i.e., to discover those \ncomponents and tasks with (non-)compatible connectivity of in- \nputs/outputs, execution orders, data domains, data formats, data\n\n[Página 11]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 553 \nTable 9 \nSPARQL queries for case study Irish flower classification on Weka, as well as on BigML. \nStep SPARQL Result\n(1)SELECT DISTINCT ?individual\nWHERE {\n?individual rdf:type ?type .\n?type rdfs:subClassOf* bigowl:DMDataClass .\n}Iris, Contact-lens, CPU, Diabetes,\nGlass, Ionosphre, Labor,\nReutersCorn, Segment,..\n(2)SELECT ?algorithm\nWHERE {\nweka:Iris rdf:type ?typeD .?typeD rdfs:subClassOf* ?classSomePropertyAlgorithm.?algorithm rdf:type ?type.?type rdfs:subClassOf* bigowl:DataMiningAlgorithm.\nbigowl:DataMiningAlgorithm rdfs:subClassOf* [\na owl:Restriction ;\nowl:onProperty bigowl:manages ;\nowl:someValuesFrom ?classSomePropertyAlgorithm ] .\n}J48, LogisticRegression, NaiveBayes,\nRepTree, IBk, LinearNNSearch,\nSMO, ...\n(3)SELECT distinct ?comp ?taskWHERE {\n?comp bigowl:hasAlgorithm weka:J48 .?task rdf:type bigowl:Task .\n?task bigowl:hasComponent ?comp. 
} ClassificationJ48Component,\nClassificationJ48Task\n(4)SELECT distinct ?dataCollection WHERE {\n?dataCollection rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataCollection.}DataCollectionOpenData,\nDataCollectionBigML,\nDataCollectionHDFS,\nDataCollectorFS, ...\n(5)SELECT distinct ?dataSink WHERE {\n?dataSink rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataSink.}VisualizationPlot,\nDataSinkHDFSStore,\nDataSinkOracleStore,\nDataSinkFSStore, ...\n(6)SELECT distinct ?task1 ?task2 WHERE {\n?task1 rdf:type bigowl:Task . ?task2 rdf:type bigowl:Task .\n?task1 bigowl:specifiesOutputClass ?output .?task2 bigowl:specifiesInputClass ?output . }ClassAsignerIrisTask,\nClassificationJ48Task,\nClassifierPerformanceEvaluatorTask,\nCrossValidaionFolderMarkerTask,\nTextViewerTask\ntypes, etc. SWRL rules are then evaluated by the reasoner after \nclassifying Big Data components in accordance with axioms, as de- \nfined in Table 1 . In concrete, there are two types of axioms associ- \nated with OWL-DL classes for reasoning, namely: subClassOf , which \nis used to define the necessary conditions for a class to be consid- \nered a member of a given OWL class; and equivalentClass , for an- \nnotating when two classes can be considered as equivalent, if they \ncomply the conditions. \nBIGOWL imports subClassOf axioms from DMOP to specify tax- \nonomy classification of Data Mining contexts and their data. In this \nsense, subclasses are also the natural way of describing hierarchy \nof algorithmic families and versions in optimization analyses. For \ninstance, Genetic Algorithms are subclasses of Evolutionary Algo- \nrithms and these in turn, are subclasses of Population Based Algo- \nrithms. This structural information is then considered in reasoning \ntime for algorithm recommendation. The main axioms for subclass \nclassification are defined in Table 10 , which correspond to Data \nMining and Optimization algorithmic families. 
\nFurthermore, a series of specific SWRL rules are described for \nassessing the compatibility of components. As commented before, \nthe main goal is to address the generation of well-formed Big Data \nworkflows. A description of these rules is as follows: - Compatibility between task, component and Data Mining \nalgorithm . This rule is used to check that input data model is com- \npatible with the task that is indeed an instance (or implementa- \ntion) of a component. In this specific case, the used component \nrefers to a Data Mining Algorithm to perform a specific analysis. \nIn short, this rule is used by the reasoner to validate compatibility \nbetween data mining component and data source. The result is a \npredicate indicating that data “feeding” the component are com- \npatible with the analytic algorithm, so a task can be launched to \nrun it on the underlying platform. \nbigowl:specifiesInputClass(?task, ?data) ˆ\nbigowl:hasComponent(?task, ?comp) ˆbigowl:hasAlgorithm(?comp, ?alg) ˆbigowl:DataMiningAlgorithm(?alg) ˆ\nbigowl:DMDataClass(?data)\n-> bigowl:isCorrect(?alg, ?data)\nNote that a similar rule is defined in the semantic model to \nconsider optimization algorithms. \n- Compatibility between tasks of a workflow . This rule is ap- \nplied to a complete workflow. It is used to check that input/output \ndata connections of each pair of consecutive tasks are “semanti-\n\n[Página 12]\n554 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 10 \nOWL axioms for algorithmic subclass classification. 
\nClass Classification rule\nOptimization AlgorithmOptimizationAlgorithm subClassOf\n((implements some OptimizationStrategy) and\n(resolves some OptimizationProblem)) or Algorithm\nDataMining AlgorithmOptimizationAlgorithm subClassOf\n(manages some DMDataClass) or Algorithm\nOptimization ComponentOptimization subClassOf (hasAlgorithm only\n(OptimizationAlgorithm or MachineLearning))\nDataMining ComponentDataMining subClassOf (hasAlgorithm only\n(DataMiningAlgorithm or MachineLearning))\ncally” similar. The outcome is a new predicate indicating whether \neach two consecutive tasks are mutually compatible, or not. \nWorkflow(?w) ˆ\nbigowl:hasTask(?w, ?task1) ˆbigowl:order(?task1, ?ord1) ˆbigowl:hasTask(?w, ?task2) ˆ\nbigowl:order(?task2, ?ord2) ˆ\nswrlb:add(?ord2, ?ord1, 1) ˆbigowl:specifiesInputClass(?task2, ?data)ˆ\nbigowl:specifiesOutputClass(?task1, ?data)\n-> bigowl:compatibleWith(?task1, ?task2)\n- Connectivity between tasks and data . Similarly to the pre- \nvious one, this rule is used to indicate that two instances of tasks \nare properly linked, that is to say, it checks that the input data of \ntask2 are covered with the output data of task1 , according to \nthe execution order established in the workflow. \nWorkflow(?w) ˆbigowl:hasTask(?w, ?task1) ˆbigowl:order(?task1, ?ord1) ˆbigowl:hasTask(?w, ?task2) ˆ\nbigowl:order(?task2, ?ord2) ˆ\nswrlb:add(?ord2, ?ord1, 1) ˆbigowl:specifiesInputClass(?task2, ?data) ˆ\nbigowl:specifiesOutputClass(?task1, ?data)\n-> bigowl:isConnected(?task2, ?data)\n- Workflow correctness . Finally, this rule validates that all the \ncomponents, instanced by corresponding tasks and data sources, \nare correctly arranged and connected. The result is then a new \npredicate indicating whether the complete workflow is correct, or \nnot. 
\nWorkflow(?w) ˆbigowl:hasTask(?w, ?task) ˆbigowl:numberOfInput(?task, ?nIn) ˆbigowl:isConnected(?task, ?data).\nsqwrl:makeSet(?set, ?data) ˆ\nsqwrl:groupBy(?set, ?task).sqwrl:size(?cont, ?set) ˆ\nswrlb:equal(?cont, ?nIn)\n-> sqwrl:select(?cont, ?nIn, ?task) ˆbigowl:isCorrectWorkflow(?w, true)In summary, these case studies are used as a “proof of concept ”\nto somehow highlight that the proposed semantic model is able to \nsupport in the design of Big Data analytics. In this regard, BIGOWL \nenables automatic SPARQL querying for component recommenda- \ntion, as well as reasoning procedures for workflow validation. \n6. Discussions \nOne of the main research findings we claim with the design \nand implementation of BIGOWL is the ability to represent and con- \nsolidate knowledge involving Big Data analytics. This semantic ap- \nproach allows us to annotate (i.e. to “semantize”) all the meta- \ndata flowing from multiple data sources, processing components \nand analytic algorithms. The meta-data are integrated following \nthe BIGOWL structure and stored in an RDF repository. \nOn the one hand, the results obtained in the two case stud- \nies indicate that, driven by the ontological model, it is possible \nto progressively deliver component recommendations for the con- \nstruction of Big Data analytics workflows. The resulting workflows \nare indeed enhanced with semantic knowledge that explicitly de- \nscribes and registers the data lineage (data provenance in database \nsystems), from sources to results. It also would enable to replay \nspecific portions or inputs of the data flow for step-wise debug- \nging or regenerating lost outputs. In the BIGOWL semantic model, \ndata lineage is mapped with RDF triples referring to records of the \ninputs, entities, systems, algorithms and processes that influence \ndata of interest, hence providing a historical record of the data ob- \ntained (as results) and its origins (as sources). 
\nBased on the analysis provided in the two case studies, the \nuser is able to identify the correct path the data follow and how \nthey are modified to obtain added value, for a given domain of \nknowledge. For example, in the first case study, a series of data \nsources involving information about urban traffic in the city of \nNew York (with geo-locations, travel times, densities, tweets, etc.) \nare semantically related (or linked) to the results obtained, in form \nof optimized routes in a problem characterization of the classical \nTSP. In this case study, the outputs are encoded in form of routes, \nwhere the travel time and the routing distance are optimized. This \nway, the resulting routes are linked to the traffic densities and the \nTwitter messages, so the data lineage is registered with semantic \nannotations. \nSimilarly, in the second case study, it is possible to connect \nprediction accuracies with classification algorithms, for the Irish \nflower database. In addition, the running experiences acquired \nwhen using different execution frameworks, e.g., in-house/in-cloud, \nare also annotated as results.\n\n[Página 13]\nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 555 \nAnother important finding lies in the possibility of using the \nsemantic knowledge-base, now consolidated in the RDF repository, \nto perform reasoning tasks, hence to infer new knowledge. In this \nstudy, a series of SWRL rules are used to train the reasoner. In this \nstudy, a reasoner is used to evaluate a set of SWRL rules defined \nfor the specific task of workflow validation. In this regard, the val- \nidation analysis performed by the reasoner required 644 ms for \ncase study 1 and 673 ms for case study 2. Taking into account that \nwe used the Stardog OWL 2 reasoner, the time spent in reasoning \ntasks is acceptable for workflow validation. 
\nOn the other hand, the main constraint of the proposed seman- \ntic model is that it needs a domain ontology to cover the prob- \nlem knowledge domain. This domain ontology contains the spe- \ncific concepts for a given case, so it can be reused in domains \nwhere previous efforts provided such model. However, if such on- \ntology is not available, then its design is required. As explained \nin Section 4.1 , the class Data in BIGOWL is used, not only to an- \nnotate all the data flowing in the analytic workflow, but also to \nallow alignment with third parties’ ontologies covering the spe- \ncific problem domain of knowledge. Additionally, the general on- \ntology could miss concepts that would be needed in some cases \nand are not described in the current model. This constraint can be \nsolved by proposing an extension, in form of new version release \nof BIGOWL, through a collaborative portal. In this sense, BIGOWL is \npublicly available at WebProtégé, 19 where any registered user can \nintroduce changes. These changes will be reviewed on a regular ba- \nsis to approve or reject them. The last stable version of the ontol- \nogy will be provided in the project GitHub repository. 20 \nIn addition, a secondary constraint arises when a new workflow \nis generated or executed by a user, since a series of new annota- \ntions are required to store all the meta-data involved in the data \nanalytic process, in form of RDF triples. This makes the RDF repos- \nitory increase significantly, which would promote, not only fu- \nture reasoning procedures to infer new knowledge from these data, \nbut also their connection with other Linked Data. In this sense, \nthe efficient management of large RDF repositories has become a \nchallenging task attracting many scholars to research ( Zomaya & \nSakr, 2017 ), which means a clear implication for academia. 
\nIn terms of practical implications, the proposed semantic model \nrepresents an initial demonstrator for the experimental piloting of \nBig Data frameworks enhanced with semantics. The objective is to \nobtain “Smart Data” and promote the data value chain in industry \nprocesses, which is a key challenge nowadays as reflected in the \nStrategic Research and Innovation Agenda of the Big Data Value As- \nsociation (EU SRIA 4.0 BDVA). 21 Several industrial projects in this \nassociation, like BigDataEurope 22 and BigOceanData, 23 are focused \non exploiting semantics in Big Data analytics, so they could par- \ntially take advantage of BIGOWL as reference ontological model. \n7. Conclusions \nIn this work, an ontological approach called BIGOWL is pro- \nposed to provide a conceptual framework for the annotation of \nBig Data analytics. The proposed semantic model is materialized \nby means of an RDF repository, and programmatic querying and \nreasoning functions. \nTo test the initial hypothesis, two case studies have been devel- \noped, which consist in: (1) real-world streaming traffic data pro- \ncessing for route optimization in urban environment, and (2) aca- \ndemic data mining classification on local/on-cloud platforms. The \n19 WebProtégé https://www.goo.gl/F6fYUc . \n20 GitHub https://www.github.com/KhaosResearch/BIGOWL . \n21 http://www.bdva.eu/sites/default/files/BDVA _ SRIA _ v4 _ Ed1.1.pdf . \n22 https://www.big- data- europe.eu/ . \n23 http://www.bigoceandata.com/ . experience on these cases revealed that BIGOWL approach is useful \nwhen integrating knowledge domain concerning a specific analytic \nproblem. Consequently, the integrated knowledge is used for guid- \ning the design of Big Data analytics workflows, by recommending \nnext components to be linked, and supporting final validation. 
\nIt is worthy to declare that the proposed semantic model is cur- \nrently populated with those annotated elements required to set the \ncase studies reported in this work, although it can be fed with \nnew instances regarding other Big Data workflows. \nThis motivates our future research agenda, which entails a \nfirst phase to provide automatic facilities for ontology population, \nhence to enrich the semantic approach; second, to provide new \nmechanisms to promote the use of contextual domain of knowl- \nedge in the generation of Big Data analytic solutions; and third, to \ngenerate new and heterogeneous use cases of analytics workflows \nthat would lead us to find and solve new possible deficiencies, as \nwell as to enrich the knowledge base. \nReferences \nACM-SIGKDD (2014). Data mining curriculum. ACM SIGKDD 2006-04-30. Retrieved \n2014-01-27. \nAllahyari, M. , Kochut, K. , & Janik, M. (2014). Ontology-based text classification into \ndynamically defined topics. In 2014 IEEE international conference on semantic \ncomputing (pp. 273–278) . \nBarba-González, C. , García-Nieto, J. , Nebro, A. J. , Cordero, J. A. , Durillo, J. J. , \nNavas-Delgado, I. , et al. (2017). Jmetalsp: A framework for dynamic multi-ob- \njective big data optimization. Applied Soft Computing . In–Press–Online \nDiamantini, C., Potena, D., & Storti, E.. Ontology-driven kdd process composition. \nDou, D. , Wang, H. , & Liu, H. (2015). Semantic data mining: A survey of ontolo- \ngy-based approaches. In Semantic computing (icsc), 2015 ieee international con- \nference on (pp. 244–251). IEEE . \nGrosof, B. N. , & Poon, T. C. (2004). SweetDeal: Representing agent contracts with \nexceptions using semantic web rules, ontologies, and process descriptions. In- \nternational Journal of Electronic Commerce, 8 (4), 61–97 . \nGruber, T. R. (1995). Toward principles for the design of ontologies used for \nknowledge sharing? International Journal of Human-Computer Studies, 43 (5–6), \n907–928 . 
\nGruber, T. R. , et al. (1993). A translation approach to portable ontology specifica- \ntions. Knowledge Acquisition, 5 (2), 199–220 . \nHarris, S. , Seaborne, A. , & Prud’hommeaux, E. (2013). Sparql 1.1 query language. W3C \nRecommendation, 21 (10) . \nHorrocks, I. , Patel-Schneider, P. F. , Bechhofer, S. , & Tsarkov, D. (2005). OWL rules: \nA proposal and prototype implementation. Web Semantics: Science, Services and \nAgents on the World Wide Web, 3 (1), 23–40 . \nJing, L. , Ng, M. , & Huang, J. (2010). Knowledge-based vector space model for text \nclustering. Knowledge and Information Systems, 25 (1), 35–55 . \nKeet, C. , Ławrynowicz, A. , d’Amato, C. , Kalousis, A. , Nguyen, P. , & Palma, R. (2015). \nThe data mining optimization ontology. Web Semantics, 32 , 43–53 . \nKietz, J. , Serban, F. , Bernstein, A. , & Fischer, S. (2010). Data mining workflow tem- \nplates for intelligent discovery assistance and auto-experimentation. In Proceed- \nings- of the ecml/pkdd: 10 (pp. 1–12) . \nKonys, A. (2016). Ontology-based approaches to big data analytics. In International \nmulti-conference on advanced computer systems (pp. 355–365) . \nKuiler, E. W. (2014). From big data to knowledge: An ontological approach to big \ndata analytics. Review of Policy Research, 31 (4), 311–318 . \nKumara, B. T. G. S. , Paik, I. , Zhang, J. , Siriweera, T. H. A. S. , & Koswatte, K. R. C. (2015). \nOntology-based workflow generation for intelligent big data analytics. In 2015 \nieee international conference on web services (pp. 495–502) . \nLi, L. , Yevseyeva, I. , Basto-Fernandes, V. , Trautmann, H. , Jing, N. , & Em- \nmerich, M. (2017). Building and using an ontology of preference-based multi- \nobjective evolutionary algorithms. In H. Trautmann, G. Rudolph, K. Klamroth, \nO. Schütze, M. Wiecek, Y. Jin, & C. Grimme (Eds.), Evolutionary multi-criterion \noptimization: 9th international conference, EMO 2017, Münster, Germany, March \n19–22, 2017, proceedings (pp. 406–421). 
Cham: Springer International Publish- \ning . \nMarinica, C. , & Guillet, F. (2010). Knowledge-based interactive postmining of associa- \ntion rules using ontologies. IEEE Transactions on Knowledge and Data Engineering, \n22 (6), 784–797 . \nMasolo, C. , Borgo, S. , Gangemi, A. , Guarino, N. , & Oltramari, A. (2003). Wonderweb \ndeliverable d18, ontology library (final). ICT Project, 33052 . \nMcBride, B. (2004). The resource description framework (rdf) and its vocabulary de- \nscription language rdfs. In Handbook on ontologies (pp. 51–65). Springer . \nMcGuinness, D. L. , Van Harmelen, F. , et al. (2004). Owl web ontology language \noverview. W3C Recommendation, 10 (10), 2004 . \nNguyen, P. , Hilario, M. , & Kalousis, A. (2014). Using meta-mining to support data \nmining workflow planning and optimization. Journal of Artificial Intelligence Re- \nsearch, 51 , 605–644 . \nNoy, N. , & McGuinness, D. L. (2001). Ontology development 101: A guide to creating \nyour first ontology. Technical report .\n\n[Página 14]\n556 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nNoy, N. F., McGuinness, D. L. et al. (2001). Ontology development 101: A guide to \ncreating your first ontology. \nPhan, N. , Dou, D. , Wang, H. , Kil, D. , & Piniewski, B. (2015). Ontology-based deep \nlearning for human behavior prediction in health social networks. In Proceed- \nings of the 6th ACM conference on bioinformatics, computational biology and health \ninformatics (pp. 433–442). ACM . \nPinto, A. , Scioscia, F. , Loseto, G. , Ruta, M. , Bove, E. , & Sciascio, E. D. (2015). A seman- \ntic-based approach for machine learning data analysis. In 2015 IEEE international \nconference on semantic computing (ICSC) (pp. 324–327) . \nPrud, E. , & Seaborne, A. (2006). Sparql query language for rdf. W3C Recommendation . \nRistoski, P. , & Paulheim, H. (2016). Semantic web in data mining and knowledge \ndiscovery: A comprehensive survey. 
Web Semantics: Science, Services and Agents \non the World Wide Web, 36 , 1–22 . \nRoldán-García, M. , García-Nieto, J. , & Aldana-Montes, J. F. (2017). Enhancing seman- \ntic consistency in anti-fraud rule-based expert systems. Expert Systems with Ap- \nplications, 90 (Supplement C), 332–343 . \nShearer, C. (2000). The crisp-dm model: The new blueprint for data mining. Journal \nof Data Warehousing, 5 (4), 13–22 . \nSirin, E. , Parsia, B. , Grau, B. C. , Kalyanpur, A. , & Katz, Y. (2007). Pellet: A practical \nowl-dl reasoner. Web Semantics: Science, Services and Agents on the WWW, 5 (2), \n51–53 . \nStaab, S. , & Studer, R. (2013). Handbook on ontologies . Springer Science & Business \nMedia . \nYaman, A. , Hallawa, A. , Coler, M. , & Iacca, G. (2017). Presenting the ECO: Evolution- \nary computation ontology. In European conference on the applications of evolu- \ntionary computation (pp. 603–619) . \nZáková, M. , Kremen, P. , Zelezny, F. , & Lavrac, N. (2011). Automating knowledge dis- \ncovery workflow composition through ontology-based planning. IEEE Transac- \ntions on Automation Science and Engineering, 8 (2), 253–264 . \nZomaya, A. Y. , & Sakr, S. (2017). Handbook of big data technologies (1st). Springer \nInternational Publishing .",
+ "a2079249-0ae0-4430-8573-2c14b24a8efe": {
+ "content": "Expert Systems With Applications 115 (2019) 543–556 \nContents lists available at ScienceDirect \nExpert Systems With Applications \njournal homepage: www.elsevier.com/locate/eswa \nBIGOWL: Knowledge centered Big Data analytics /p82 \nCristóbal Barba-González, José García-Nieto ∗, María del Mar Roldán-García, \nIsmael Navas-Delgado, Antonio J. Nebro, José F. Aldana-Montes \nDepartmento de Lenguajes y Ciencias de la Computación, University of Málaga, ETSI Informática, Campus de Teatinos, Málaga 29071, Spain \na r t i c l e i n f o \nArticle history: \nReceived 5 April 2018 \nRevised 26 July 2018 \nAccepted 14 August 2018 \nAvailable online 23 August 2018 \nKeywords: \nOntology \nBig Data analytics \nSemantics \nKnowledge extraction a b s t r a c t \nKnowledge extraction and incorporation is currently considered to be beneficial for efficient Big Data an- \nalytics. Knowledge can take part in workflow design, constraint definition, parameter selection and con- \nfiguration, human interactive and decision-making strategies. This paper proposes BIGOWL, an ontology \nto support knowledge management in Big Data analytics. BIGOWL is designed to cover a wide vocab- \nulary of terms concerning Big Data analytics workflows, including their components and how they are \nconnected, from data sources to the analytics visualization. It also takes into consideration aspects such \nas parameters, restrictions and formats. This ontology defines not only the taxonomic relationships be- \ntween the different concepts, but also instances representing specific individuals to guide the users in \nthe design of Big Data analytics workflows. For testing purposes, two case studies are developed, which \nconsists in: first, real-world streaming processing with Spark of traffic Open Data, for route optimization \nin urban environment of New York city; and second, data mining classification of an academic dataset on \nlocal/cloud platforms. 
The analytics workflows resulting from the BIGOWL semantic model are validated \nand successfully evaluated. \n©2 0 1 8 Elsevier Ltd. All rights reserved. \n1. Introduction \nIn accordance with the recent Gartner’s report, 1 an emerging \nchallenge in Big Data is to construct data-driven intelligent appli- \ncations that capture and inject domain knowledge in the analyt- \nical processes, including context and using a standardized format. \nContext refers to all the relevant (meta)-information to support the \nanalysis and to help interpreting its results. This will facilitate the \nintegration (in a standardized way) with third parties’ data, algo- \nrithms, business intelligence (BI) and visualization services. \nThe use of semantics as contextual information will enhance \nthe analytical power of the algorithms, as well as the reuse of \nsingle components in data analytics workflows ( Ristoski & Paul- \n/p82 This work has been partially funded by Grants TIN2014-58304, TIN2017-86049- \nR (Spanish Ministry of Education and Science) and P12-TIC-1519 (Plan Andaluz de \nInvestigación, Desarrollo e Innovación). Cristóbal Barba-González is supported by \nGrant BES-2015-072209 (Spanish Ministry of Economy and Competitiveness). José\nGarcía-Nieto is the recipient of a Post-Doctoral fellowship of “Captación de Talento \npara la Investigación” Plan Propio at Universidad de Málaga. \n∗Corresponding author. \nE-mail addresses: cbarba@lcc.uma.es (C. Barba-González), jnieto@lcc.uma.es \n(J. García-Nieto), mmar@lcc.uma.es (M.d.M. Roldán-García), ismael@lcc.uma.es (I. \nNavas-Delgado), antonio@lcc.uma.es (A.J. Nebro), jfam@lcc.uma.es (J.F. Aldana- \nMontes). \n1 https://www.gartner.com/doc/3656517/adopt-datadriven-approach- \nconsolidating-infrastructure . heim, 2016 ). Therefore, the development of ways to make the do- \nmain knowledge explicit and usable is needed to improve the \ndata processing and analysis tasks. 
The Semantic Web technolo- \ngies can be used to annotate not only the knowledge domain \nof the data, but also the analytics’ meta-data ( Keet, Ławrynow- \nicz, d’Amato, Kalousis, Nguyen, Palma, Stevens, & Hilario, 2015 ), \nincluding: algorithms’ parameters, input variables, tuning experi- \nences, expected behaviors and taxonomies. This will facilitate the \nreuse and composition of Big Data analytics in a proper manner, as \nwell as to enhance the quality of consumed and produced data. \nIn this regard, ontologies describe concepts, relationships, \nclasses, individuals, formal logic axioms and objects of a particu- \nlar domain ( Gruber, 1995 ). The objects refer to entities and events \n(concepts) in the real world, and their relations represent the se- \nmantic links between these entities. A series of studies have been \nappearing in the last few years, in which ontological approaches \nare suggested to enhance Big Data analytics ( Konys, 2016; Kuiler, \n2014 ). However, they are presented as conceptual frameworks, still \nin an early stage of development, and mostly oriented to the spe- \ncific domain of health system applications. \nThis motivates us to propose an ontology-driven approach to \nsupport knowledge management in Big Data analytics workflows. \nThe proposed ontology is called BIGOWL (BIG data analytics OWL 2 \n2 OWL refers to the Web Ontology Language described in Section 2.1 . \nhttps://doi.org/10.1016/j.eswa.2018.08.026 \n0957-4174/© 2018 Elsevier Ltd. All rights reserved. \n544 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nontology), which acts as a formal schema for the representation \nand consolidation of knowledge in Big Data analytics. Knowledge \nincorporation is in turn beneficial for an efficient algorithmic per- \nformance, by taking part in operator’s design, parameter selection, \nhuman interactive and decision-making strategies. 
\nOur scientific hypothesis is as follows: “The semantic annotation \nof Big Data sources, components and algorithms can acts as a link to \ncapture and incorporate the domain knowledge to guide and enhance \nthe analytical processes ”. In addition, the semantic annotation can \nprovide the background for reasoning methods based on axiomatic \nand rule logic recommendations. \nTo test this hypothesis, a semantic model has been gener- \nated, which comprises an RDF 3 (Resource Description Framework) \nrepository that follows the BIGOWL scheme. This repository can be \nqueried by high level algorithms using SPARQL. The goal is to prop- \nerly feed artificial intelligence procedures capable of guiding the \ndesign of Big Data analytics workflows. \nAs a proof-of-concept, we show how BIGOWL can be used to \nguide the design of real-world and academic analytic workflows. \nA first case study consists in optimizing vehicular routes based on \nNew York real-time Open Data about urban traffic (average speeds \nof vehicles, traffic densities, etc.). 4 The data source is managed by \nstreaming processing tasks (Kafka and Spark), after which they are \noptimized (jMetalSP 5 ) and visualized. The second case study is a \nclassification workflow modeled by using the popular Weka 6 li- \nbrary for data mining, as well as the BigML in-cloud service. 7 \nThe main contributions of this study are: \n•The proposed ontology, BIGOWL, has been designed and imple- \nmented for the representation and consolidation of knowledge \nin Big Data analytics. It considers a large and complemented set \nof concepts, attributes and relationships that have been taken \nfrom Big Data ecosystem. \n•A semantic approach has been implemented to annotate (i.e. \nto “semantize”) all the involved meta-data from multiple data \nsources, processing components and analytic algorithms. The \nmeta-data are integrated following the BIGOWL structure and \nstored in a common RDF repository. 
\n•The semantic model is evaluated in the context of two realis- \ntic use cases: real-time routing calculation in urban traffic and \nclassical classification with decision trees. The proof-of-concept \nlead us to test our initial hypothesis. \nThe remaining of this paper is structured as follows. In \nSection 2 , background concepts and literature overview are pre- \nsented. Section 3 presents current practices in Big Data analyt- \nics. Section 4 describes the semantic model, comprising the on- \ntology, RDF repository, mappings and workflow composition assis- \ntant. Section 5 presents the use case for testing and validation. In \nSection 6 , a series of discussions are included. Conclusions and fu- \nture work are drawn in Section 7 . \n2. Background and related work \nTo make this paper self-contained, this section describes back- \nground concepts in the Semantic Web field. A review of the state \nof the art is also provided to point out the main differences of the \nrelated works with the proposed approach. \n3 RDF in W3C https://www.w3.org/RDF/ . \n4 https://www.data.cityofnewyork.us/Transportation/Real- Time- Traffic- Speed- Data/ \nxsat-x5sa . \n5 http://www.jmetal.sourceforge.net/ . \n6 https://www.cs.waikato.ac.nz/ml/weka/ . \n7 https://www.bigml.com/ . Table 1 \nBasic OWL-DL semantic syntax used to formally define the proposed \nontology. \nDescriptions Abstract syntax DL syntax \nOperators intersection ( C 1 , C 2 , /22c5/22c5/22c5, C n ) C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \nunion ( C 1 , C 2 , /22c5/22c5/22c5, C n ) C 1 /2294C 2 /2294/22c5/22c5/22c5/2293C n \nRestrictions for at least 1 value V from C ∃ V.C \nfor all values V from C ∀ V.C \nR is Symmetric R ≡R −\nClass Axioms A partial ( C 1 , C 2 , /22c5/22c5/22c5, C n ) A /subsetsqequal C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \nA complete ( C 1 , C 2 , /22c5/22c5/22c5, C n ) A ≡C 1 /2293C 2 /2293/22c5/22c5/22c5/2293C n \n2.1. Background concepts \n•Ontology. 
In accordance with Noy, McGuinness et al. (2001) , an \nontology provides a formal representation of the real world. \nIt defines an explicit description of concepts in a domain of \ndiscourse (classes or concepts), properties of each concept de- \nscribing various features and attributes of the concept (proper- \nties) and restrictions on properties. Ontologies are part of the \nW3C standard stack of the Semantic Web. 8 An ontology to- \ngether with a set of individual instances of classes constitutes a \nknowledge base and offer services to facilitate interoperability \nacross multiple heterogeneous systems and databases. \n•RDF. Resource Description Framework ( McBride, 2004 ) is a \nW3C recommendation that defines a language for describ- \ning resources on the web. RDF describes resources in terms \nof triples, consisting of a subject, predicate and object. RDF \nSchema (RDFS) ( Staab & Studer, 2013 ) describes vocabularies \nused in RDF descriptions. \n•OWL. The Ontology Web Language is used to define ontolo- \ngies on the Web, which extends RDF and RDFS, but adding a \nvocabulary. From a formal description, OWL is equivalent to a \nvery expressive description logic DL, where an ontology cor- \nresponds to a Tbox ( Gruber et al., 1993 ). In this sense, OWL- \nDL is syntactic description that gives maximum expressive- \nness while retaining computational completeness and decid- \nability ( McGuinness, Van Harmelen et al., 2004 ). In this work, \nwe use OWL-DL syntax summarized in Table 1 to formalize the \nproposed ontology. \n•SPARQL is a query language for easy access to RDF \nstores. It is the query language recommended by \nW3C ( Harris, Seaborne, & Prud’hommeaux, 2013 ) to work \nwith RDF graphs ( Prud, Seaborne et al., 2006 ), then supporting \nqueries and web data sources identified by URIs. \n•SWRL. 
The Semantic Web Rule Language provides the \nOWL-based ontologies with procedural knowledge, which \ncompensates for some of the limitations of ontology in- \nference, particularly in identifying semantic relationships \nbetween individuals ( Horrocks, Patel-Schneider, Bechhofer, \n& Tsarkov, 2005 ). SWRL uses the typical logic expres- \nsion “Antecedent ⇒ Consequent ” to represent semantic rules. \nBoth antecedent (rule body) and consequent (rule head) \ncan be conjunctions of one or more atoms written as \n“atom 1 ∧ atom 2 ∧ ⋯ ∧ atom n ”. Each atom is attached to one or \nmore parameters represented by a question mark and a vari- \nable (e.g., ? x ). The most common uses of SWRL include trans- \nferring characteristics and inferring the existence of new indi- \nviduals ( Grosof & Poon, 2004 ). 9 \n8 https://www.w3.org/standards/semanticweb/ . \n9 https://www.w3.org/Submission/SWRL/ . \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 545 \n2.2. Related work \nIn the last decade, there have been appearing a series of stud- \nies in which ontological approaches are defined to express the \nknowledge domain in data mining and optimization algorithms. A \nrepresentative set of these works are compiled in a recent sur- \nvey ( Dou, Wang, & Liu, 2015 ), in which they are organized by \ncategories of algorithms and applications: association rule discov- \nery ( Marinica & Guillet, 2010 ), classification ( Allahyari, Kochut, & \nJanik, 2014 ) and clustering ( Jing, Ng, & Huang, 2010 ). In these ap- \nplications, semantics is used with different objectives, such as: to \nreduce the search space by specifying restrictions, to filter results \nin the post-processing stage, and to annotate the results of data \nmining processes. \nFollowing with this research line, some recent works include \nontologies to guide the processes in machine learning tasks. 
For \nexample, in Pinto, Scioscia, Loseto, and Ruta (2015) and Roldán- \nGarcía, García-Nieto, and Aldana-Montes (2017) , two different on- \ntologies are used in the classification process to infer incon- \nsistencies between concepts by means of semantic reasoning. \nIn Phan, Dou, Wang, Kil, and Piniewski (2015) , an ontology-driven \ndeep learning model is proposed to predict human behavior. \nIn the field of optimization, an interesting approach has been \nrecently proposed in Yaman, Hallawa, Coler, and Iacca (2017) , \nwhere the ECO ontology is defined to formally represent knowl- \nedge in evolutionary computation algorithms. This ontology can \nbe used for suggesting strategies for solving optimization prob- \nlems. At the same time, an OWL ontology has been pro- \nposed in Li, Yevseyeva, Basto-Fernandes, Trautmann, Jing, and \nEmmerich (2017) to model and systematize the knowledge of \npreference-based multi-objective evolutionary algorithms. These \nontologies are validated in use cases focused on algorithmic and \nparameter selection in academic problems. \nFrom a different point of view, a parallel line of research focuses \non defining ontologies for the semantic annotation of data analytic \nworkflows. The main objective is to model the input and output \nof algorithms involved in data mining and knowledge base discov- \nery (KDD) workflows to generate valid compositions. To this end, \nseveral OWL ontologies such as: KDDONTO ( Diamantini, Potena, & \nStorti ), DMWF ( Kietz, Serban, Bernstein, & Fischer, 2010 ) and KD \n( Záková, Kremen, Zelezny, & Lavrac, 2011 ), were proposed. How- \never, they did not describe the problem domain, or those basic \nconcepts (algorithm, type of analysis, task, dataset, attribute, etc.) \nthat can be combined to define entities or constraints. 
In fact, \nthese ontologies were not designed with the objective of opti- \nmizing the performance of the data mining algorithms, since they \ndo not offer detail enough to provide support to what is known \nas meta-learning. In Nguyen, Hilario, and Kalousis (2014) , meta- \nlearning is defined as the KDD procedure to improve performance \nin data mining processes, using information collected during the \nexperimentation phase of these algorithms. In this regard, the use \nof semantics is considered not only for the algorithmic composi- \ntion, but also for the improvement of data mining processes, taking \nadvantage of acquired knowledge from past experience. \nIn this context, the EU-FP7 European initiative e-LICO 10 pro- \nposed the DMOP ontology ( Keet et al., 2015 ), which is de- \nfined to support the analytic workflow composition by follow- \ning the standard CRISP-DM ( Shearer, 20 0 0 ). DMOP is used to de- \nfine analytical workflows, as well as to describe algorithms, pa- \nrameters, inputs/outputs and a large amount of meta-data in- \ncluded in typical data mining processes. A step further was taken \nby Kumara, Paik, Zhang, Siriweera, and Koswatte (2015) that use \n10 http://www.e-lico.eu/ . Automatic Service Composition to automate the analytic workflow \ngeneration. \nAs a summary, Table 2 outlines the main features of the related \nwork with regards to the semantic approach proposed here. These \nfeatures consist of specifying whether the existing approaches: fo- \ncus on data mining or optimization, are oriented to Big Data, pro- \nvide proof-of-concepts, align with other ontologies, use OWL/RDF \nin the semantic model and/or describe workflow composition \ntasks. Then, it is possible to identify the actual contributions of the \nproposed semantic model beyond the state of the art, as follows: \n•BIGOWL is conceived to semantically model data analytics in \nBig Data environments. 
Similarly to other ontologies in the \nliterature, it is oriented to general KDD procedures, although \nconsidering those Big Data ecosystem elements with class in- \nstances, e.g., ontology individuals. \n•It is aligned with the DMOP ontology, which is in turn aligned \nwith CRISP-DM. They have been validated to construct data \nmining workflows. \n•Besides data mining, BIGOWL is also focused on optimization \nalgorithms, although with special interest on covering multi- \nobjective metaheuristics in Big Data environments. \n•The proposed approach is validated on two real-world use- \ncases consisting of classical data mining and streaming data \nprocessing for multi-objective optimization. \n3. Current practices in Big Data analytics \nIn current Big Data technology ecosystems, when facing a spe- \ncific data analytic task, it is usual to support on already existing \ntools. Some of those consist in commercial services often provided \nthrough cloud computing Software-as-a-Service (SaaS), which can \nbe used by no skilled people by means of workflow compositions \n(e.g., Azure ML, Amazon ML, BigML, Data Mining Cloud Frame- \nwork, and Kognitio); other tools are open-source frameworks re- \nquiring skilled users who prefer to program their application using \nmore technical approaches. Additional factors (such as: data for- \nmat, data source, volume and velocity required to analyse data) are \nalso determinant when choosing the proper technology ( Zomaya & \nSakr, 2017 ). Hadoop ecosystem represents the most used frame- \nwork for developing distributed Big Data analytic applications. \nHowever, it is conceived for high skilled users, so even the stan- \ndard workflow composition service of Hadoop (Oozie) requires cer- \ntain programming ability to be properly used. 
\nBesides technological or commercial aspects, current Big Data \nplatforms still follow the common procedure when facing data an- \nalytics tasks ( ACM-SIGKDD, 2014 ), which comprises typical steps \nof classical KDD: data collection, data transformation, data mining, \npattern evaluation, and knowledge presentation. \nKeeping this in mind, the proposed semantic approach is ori- \nented to general KDD procedures, then leading the underlying \nBig Data technological platform to be semantically annotated with \nclass instances, e.g., individuals in the ontology. \n4. Semantic model \nOne of the main goals in this study is to capture all the needed \nsemantics to guide the smart design of Big Data analytics work- \nflows and to enhance their performance. For this reason, we opted \nto design an OWL 2 ontology to describe analytic algorithms, \ndatasets, problems, and workflows in the Big Data context. \nTo this end, the standard Ontology 101 development pro- \ncess ( Noy & McGuinness, 2001 ) has been followed, which com- \nprises seven steps: \n1. Determine the domain and scope of the ontology . The main scope \nof BIGOWL is data processing and data analytics in Big Data en- \n546 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 2 \nSummary ontologies’ features. \nFeature/Ontology CRISP-DM KDDONTO PMOEA ECO (Pinto’2015) (Phan’2015) DMWF KD DMOP BIGOWL \nData Mining /check /check /check /check /check /check /check /check \nOptimization /check /check /check \nBig Data environments /check \nProof of concepts /check /check /check /check /check /check /check /check \nAligned to other ontology /check /check \nOWL/RDF /check /check /check /check /check /check /check /check \nWorkflow composition /check /check /check /check \nFig. 1. Overview of the BIGOWL ontology. Continuous arrows refer to subclasses, whereas dotted ones refer to properties. \nvironments. 
This considers not only classical data analytic pro- \ncedures, but also specific data processing and underlying soft- \nware platform features oriented to Big Data. \n2. Consider reusing existing ontologies . As commented before, the \nproposed ontology is aligned with DMOP, which has been \nsuccessfully validated to construct data mining workflows. \nDMOP is in turn aligned with the foundational ontology \nDOLCE ( Masolo, Borgo, Gangemi, Guarino, & Oltramari, 2003 ) \nand follows the standard CRISP-DM in the definition of data \nmining processes. \n3. Enumerate important terms in the ontology . Important terms \nwere selected from the literature related to Big Data and op- \ntimization. In addition, terms from the ontologies aligned ( Keet \net al., 2015; Yaman et al., 2017 ) were also incorporated. Exam- \nples of such terms are: Component, Workflow, Task, Data, Dat- \naProcessing and Software . \n4. Define the classes and the class hierarchy . We have followed a \ntop-down approach in developing the class hierarchy. This fact \nfacilitates among others, the alignment with DMOP and DOLCE, \nthe design of annotation mappings and the use of a seman- \ntic reasoner. Fig. 1 shows the ontology core classes and hier- \narchy. For instance, the class Component has several subclasses, \nincluding DataAnalysing and DataCollection . Classes modeling al- \ngorithms, components and workflows are aligned with the class \ndmop:DataType . BIGOWL has been developed using Protégé11 \nand OWL 2. \n11 https://protege.stanford.edu/ . 5. Define the properties of classes and slots . With the purpose of \nrelating classes and defining attributes, we have included ob- \nject and data properties. A representative set of properties are \nshown in Table 3 , where the class Component is related to class \nAlgorithm by means of the object property hasAlgorithm . Data \nproperties of class Component are path, author, numberOfInputs \nand numberOfOutputs . \n6. 
Define the facets of the slots . This step includes the definition of \ncardinality constraints and value restrictions for the ontology’s \nproperties. For example, the range of the property order is re- \nstricted to integer (to specify in which step this task is carried \nout), when the class Task is its domain. \n7. Create instances . Instances or individuals in BIGOWL are \nspecific of the Big Data analytics domain. For exam- \nple, GeneratorDataTraffic is an instance of the class Kafka , \nwhich is a subclass of DataIngestion . The class Kafka has a \nproperty topicKafka (with range “string”) to indicate streams of \nrecords of Apache Kafka 12 services. \n4.1. The BIGOWL ontology \nBIGOWL has been developed following the steps described \nabove, producing 184 classes, 16 object properties (binary re- \nlationships between individuals), 20 data properties (individ- \nual attributes), 488 axioms, 66 individuals and growing. It is \nworth mentioning that classes DM-DataClass ≡DMDataClass and IO- \n12 Data Streaming Processing https://www.kafka.apache.org/ . \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 547 \nTable 3 \nComponent: object and data properties. 
\nObject properties Description logic \nhasAlgorithm ∃ hasAlgorithm.Thing /subsetsqequal Component \nhasParameter ∃ hasParameter.Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nisConnected ∃ isConnected.Thing /subsetsqequal Algorithm /2294Component /2294Task \nisCorrect ∃ isCorrect.Thing /subsetsqequal Algorithm /2294Component \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem \nnumberOfInputs ∃ numberOfInputs.Datatype Literal /subsetsqequal Algorithm /2294Component \nnumberOfOutputs ∃ numberOfOutputs.Datatype Literal /subsetsqequal Algorithm /2294Component \npath ∃ path.Datatype Literal /subsetsqequal IO-Class /2294Algorithm /2294Component \nTable 4 \nTask: object and data properties. \nObject properties Description logic \ncompatibleWith ∃ compatibleWith.Thing /subsetsqequal Task /latticetop /subsetsqequal ∀ compatibleWith.Task \nhasComponent /latticetop /subsetsqequal ∀ hasComponent.Component \nisConnected ∃ isConnected.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nData Properties Description Logic \norder ∃ order.Datatype Literal /subsetsqequal Task /latticetop /subsetsqequal ∀ order.Datatype \nClass ≡Data are declared as equivalent (with relation ≡) to align \nwith those classes from other ontologies (DMOP) that describe \nsimilar concepts. 
We use OWL-DL syntax (see Table 1 ) to formal- \nize the proposed ontology. The complete ontology is developed in \n“bigowl.owl ”fi l e and available in the GitHub repository. 13 \nA representative set of the main classes are described here, to- \ngether with their object and data properties. These classes are: \nComponent, Task, Algorithm, Data , and Workflow . Each class has de- \nfined a set of properties or conditions in order to be conceptual- \nized. That is, an individual that satisfies those properties is consid- \nered to be a member of that class. \n- Component . This class represents each processing step in the \nanalytic workflow. It is used to encapsulate one concrete function- \nality, its parameters and the corresponding inputs and outputs it \nconsiders. The class Component has four subclasses that are ori- \nented to define specific functionalities in typical data analytics pro- \ncessing chains: DataCollection , to connect to data sources; DataPro- \ncessing , to clean, curate, fuse and consolidate data; DataAnalysis , to \nperform the algorithmic function; and DataSink , to represent final \nsteps in the data flow, e.g., store and visualization. Table 3 con- \ntains the object and data properties defined for Component . In ac- \ncordance with these, a component can specify Input classes and \nOutput classes, to define the type of data it is accepting and gener- \nating, respectively. Therefore, a component can connect with other \none if their linking inputs and outputs are compatible among them. \n- Task . A task represents an instance of a component that is \nused in a workflow and can be run. As shown in Table 4 , the class \nTask has similar properties to those of Component , but including \nthe object property compatibleWith , to specify compatibility among \nconnected tasks, and the data property order , which indicates the \nspecific step of execution in which this task is scheduled, in the \nscope of the workflow. 
A Component is then a template for one or \nmore tasks, which will be used to carry out its specific functional- \nity in a workflow. \n13 URL link https://www.github.com/KhaosResearch/BIGOWL . - Algorithm . This class is devoted to cover all possible kinds of algorithms. It \nhas two main subclasses: DataMiningAlgorithm and OptimizationAl- \ngorithm ; which are used to distinguish between these two fami- \nlies of algorithms. The former one is included in form of equiv- \nalence with the class DM-Algorithm , which is linked from DMOP. \nThis way, all subclasses deriving from this class in DMOP are also \nused in BIGOWL. For the latter, i.e., OptimizationAlgorithm , a new hi- \nerarchical classification of classes has been elaborated in this study \nfor the annotation of this family, which comprises: Exact, Heuristic , \nand Metaheuristic algorithms as main subclasses. \nTable 5 includes the object and data properties of Algorithm . \nAmong its main object properties it is worth mentioning: imple- \nments , which is referred to a learning model or search strategy; \nmanages , to annotate the type of data it works; and resolves , which \nis related to the Problem it is oriented to solve. This is a use- \nful mechanism to relate classes Algorithm and Problem , which also \nshare the data property dealWith that indicates the specific fea- \ntures an algorithm should fulfill to deal with a problem. \nIn this regard, the class Problem defines a series of data proper- \nties like: numberOfConstraints, numberOfObjectives, encodedBy , and \nnumberOfVariables , that will lead a future reasoner to recommend \nthe correct algorithm to solve it. These two classes have to be \ndeclared as DisjointWith , in order to avoid future inconsistencies \nwhen querying the annotated data in a workflow. \n- Data . The class Data is devoted to annotate all the data flow- \ning throughout the analytic workflow. It is declared as EquivalentTo \nIO-Class of DMOP. 
This aligning enables datatypes defined by third \nparties’ ontologies to be contextualized in the analysis. Table 6 \ncontains the main data properties defined for this class, namely: \npath , to annotate the origin of data; and hasDataType , which de- \nfines the relation with class DataType . This last is used to define \nthe type of data, i.e. PrimitiveType (Double, Integer, Boolean, etc.) \nor StructuredType (Graph, Tree, Matrix, Vector, Tuple, etc.). \n- Workflow . It is used to guide the correct orchestration of \nthose tasks involved in a data analysis job. Its main object prop- \nerties are hasTask and hasParameter , which are formally described \nin Table 7 . These properties are used by the workflow to obtain the \nexecution order, as well as the input/output specifications of each \n548 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 5 \nAlgorithm: object and data properties. \nObject properties Description logic \nhasComponent /latticetop /subsetsqequal ∀ hasComponent.Component \nhasParameter ∃ hasParameter.Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nspecifiesInputClass ∃ specifiesInputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nspecifiesOutputClass ∃ specifiesOutputClass.Thing /subsetsqequal Algorithm /2294Component /2294Task \nimplements Transitive Property implements ∃ implements.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ implements.Strategy \nmanages ∃ manages.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ manages.DataType \nresolves ∃ resolves.Thing /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ resolves.Problem \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem \nnumberOfInputs ∃ 
numberOfInputs.Datatype Literal /subsetsqequal Algorithm /2294Component \nnumberOfOutputs ∃ numberOfOutputs.Datatype Literal /subsetsqequal Algorithm /2294Component \ndealWith ∃ dealWith.Datatype Literal /subsetsqequal Algorithm /latticetop /subsetsqequal ∀ dealWith.Datatype \nTable 6 \nData: object and data properties. \nObject properties Description logic \nhasDataType ∃ hasDataType.Thing /subsetsqequal Parameter /2294Data /latticetop /subsetsqequal ∀ hasDataType.DataType \npath ∃ path.Datatype Literal /subsetsqequal IO-Class /2294Algorithm /2294Component \nTable 7 \nWorkflow: object and data properties. \nObject properties Description logic \nhasTask ∃ hasTask.Thing /subsetsqequal Workflow /latticetop /subsetsqequal ∀ hasTask.Task \nhasParameter ∃ hasParameter Thing /subsetsqequal Workflow /2294Algorithm /2294Component \nData Properties Description Logic \nauthor ∃ author.Datatype Literal /subsetsqequal Workflow /2294Algorithm /2294Component /2294Problem /2294Software \nhasDataValue ∃ hasDataValue.Datatype Literal /subsetsqequal DataType /2294IO-Class /2294Parameter /2294Workflow \n/2294Algorithm /2294Component /2294Problem \nisCorrectWorkflow ∃ isCorrectWorkflow.Datatype Literal /subsetsqequal Workflow /latticetop /subsetsqequal ∀ isCorrectWorkflow.Datatype \nnumTasks ∃ numTask.Datatype /subsetsqequal Workflow /latticetop /subsetsqequal ∀ numTask.Datatype \ntask. This information, together with the data properties numTasks \nand isCorrectWorkflow , is then used in reasoning time to check \nwhether the workflow is correctly composed or not, i.e., to address \nsemantic validation of the analytic workflow. \n4.2. Overall approach \nAn overview of the proposed semantic model is illustrated in \nFig. 2 , which is arranged together with the underlying operational \nmodel, hence enabling actual composition of analytic workflows. \nIn this approach, BIGOWL is the ontological scheme driving the \nwhole process. 
It is the terminological box (TBox) that defines the \nvocabulary with concepts and properties in the domain of Big Data \nanalysis. As explained before, BIGOWL is developed in OWL 2 ac- \ncording to which, concepts are represented by classes and relations \nare represented by data properties or object properties. As repre- \nsented in Fig. 2 , BIGOWL is conceived as an abstract top-level on- \ntology that enables not only subontology replication e.g., to focus \non specific use cases or algorithmic families, but also linkage with \nexternal domain knowledge ontologies, which are oriented to the \nspecific problem domain (Smart Cities, Biology, etc.). \nAt bottom-level, the Assertional Box (ABox) defines all the in- \nstances in the knowledge domain (in OWL 2 an instance is rep- \nresented by an individual) involving the analytic workflows’ meta- \ndata. These instances are stored in RDF triple format in a Stardog 14 \nrepository, which is a commercial version of the Pellet OWL 2 rea- \nsoner ( Sirin, Parsia, Grau, Kalyanpur, & Katz, 2007 ), but enhanced \nwith persistence capabilities. Once the ontology (Tbox) has been \n14 http://www.stardog.com/ . loaded together with SWRL rules, a series of reasoning tasks are \nlaunched by using the Stardog OWL 2 reasoner to derive new infor- \nmation that is not explicitly expressed in the knowledge base. The \nnew information will indicate, when applicable and among others, \nwhether an analytic workflow is correctly composed, or not. \nIn this model, the Annotation Module is used to populate the \nRDF repository with new instances that involve the required meta- \ndata (annotated) to be used in workflows, for example: algorithms, \noperators, parameters, input/output (paths), data sources, database \nconnections, data sinks, software, execution order, etc. \nThe Operational Model will make use of these annotated meta- \ndata for driving the workflow composition. 
In this process, each \nstep a new component is to be selected and used, a SPARQL query \nis launched to obtain the required meta-data and to suggest the \nnext possible component/s to be included. \nA very simple (hypothetical) case of use would comprise the \nfollowing steps: \n(i) A user desires to extract patterns from a dataset and visual- \nize the results; \n(ii) Then, the user selects one algorithm from a list of data \nmining algorithms (in form of analysis component) queried \nthroughout the semantic model; \n(iii) The selected algorithm requires specific input parameters \nand data to train, so the semantic model will supply them; \n(iv) The initial dataset should be then formatted in form of data \ncollection task; \n(v) In case collected data need transformation, an intermediate \ndata processing component is included between collection \nand analysis; \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 549 \nFig. 2. General overview of the semantic model that follows the ontology’s scheme of BIGOWL. The analytic operational model address the workflow composition driven by \nthe semantic model \n(vi) The semantic model will suggest suitable output component \n(visualization) to be linked after the analytic algorithm. \nIt is worth mentioning that each step in the workflow is instan- \ntiated by a task, which entails an execution order. Then, the entire \nworkflow is arranged according to all the ordering values in tasks. \nIn summary, the semantic model acts as a mediator between \ndata provider components and data consumers. It also acts as a \ndata source and meta-data registry with functions to make “agree- \nments” on the provision and traceability of the whole data value \nchain. \n5. Validation \nFor validation purposes, two different cases of study have been \ndeveloped to show how the proposed semantic approach is used \nfor driving the composition of data analytic workflows. 
The first \none is focused on Big Data streaming processing and optimiza- \ntion of real-world traffic routes in the domain of Smart Cities. The \nsecond case study is centered on classic data mining analysis on \nacademic problem instances, although considering local and cloud \ncomputing environments. In this way, we aim at covering, as much \nas possible, different aspects in Big Data applications: algorithmic \nanalyses (optimization and data mining), velocity and volume is- \nsues (streaming processing), real-world and academic data prob- \nlems, and Big Data ecosystems (Apache Spark local and on-premise \ncluster, BigML cloud SaaS API). \nIn these two cases, a similar semantic annotation and query- \ning procedure has been followed, which consists in the man- \nual annotation (guided by domain experts) of: algorithms, tech- \nnological/platform features, and attributes of problem domain of \nknowledge; and automatic querying by means of SPARQL sen- \ntences. To distinguish individuals belonging to each case study, \ntwo different namespaces have been defined, i.e. traffic: http:// \nwww.khaos.uma.es/perception/traffic/khaosteam# and weka: http: \n//www.khaos.uma.es/perception/weka/khaosteam# , respectively. 5.1. Case study 1: streaming processing of New York City traffic \nopen-data \nThe first case study consists of a dynamic version of the \nbi-objective Traveling Salesman Problem (TSP), to minimize the \n“travel time” and the “distance” to cover certain routing points \nin an urban area. The algorithm for solving it is a dynamic variant \nof the well-known multi-objective metaheuristic NSGA-II provided \nin jMetalSP ( Barba-González, García-Nieto, Nebro, Cordero, Durillo, \nNavas-Delgado, & Aldana-Montes, 2017 ), 15 which allows parallel \nprocessing of evaluation functions in Apache Spark environment. 
\nIn the case of the dynamic bi-objective TSP, which is formu- \nlated in terms of a distance matrix and a time travel matrix, the \nperiodic changes can affect any of them. Our particular dynamic \nTSP problem instance is based on real-world data. Specifically, it \nis fed from the Open Data API provided by the New York City \nDepartment of Transportation, 16 which updates traffic information \nseveral times per minute. The information is provided as a text file \nwhere each line includes the average speed to traverse the two end \npoints defining a link in the most recent interval. The goal is then, \ngiven a list of nodes in New York city and the distances between \neach pair of nodes, to calculate the shortest possible route that visits \neach node. \nNew York’s traffic data is read periodically by an external appli- \ncation that writes a file in HDFS whenever new data are acquired, \nso we have implemented a streaming data component for that pur- \npose. This component reads periodically the new data that appeared \nin the specific directory (this is done automatically by Spark) and \nmakes a simple processing: if a change in a link is detected (time \nor distance), then the corresponding problem matrices are up- \ndated. \nThe analysis of the streaming data sources can be carried out \nin parallel by using Spark. In fact, we used a Hadoop cluster com- \n15 https://www.github.com/jMetal/jMetalSP . \n16 https://www.data.cityofnewyork.us/Transportation/Real- Time- Traffic- Speed- Data/ \nxsat-x5sa . \n550 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nFig. 3. Workflow for dynamic bi-objective optimization of TSP problem instance with Open Data New York \nposed of 100 cores in the previous study where the Big Data op- \ntimization model was presented ( Barba-González et al., 2017 ). In \naddition, two other streaming data sources were used as sepa- \nrate components, which are based on Twitter and Kafka. 
In the first \none, tweets are read from Twitter API with the topic “New York \ntraffic” and a processing of each tweet is simulated, so the prob- \nlem is updated in accordance with it (for testing purposes we set \nrandom changes in traffic scenario). This way, we combine a differ- \nent streaming source with the possibility of adjusting the process- \ning time, which will serve for performance evaluation purposes. In \nthe second source, the idea is to enrich the case study with an- \nother data source that will produce artificial data. Then we created \na Kafka message producer that generates, following uniform and \nnormal distributions, a series of random messages with data to up- \ndate the problem. Every 5 s at least 1000 messages are produced, \nbut on average about 10,000 messages are created. Both the Twit- \nter and Kafka streaming source classes have the same behavior as \nthe HDFS based one: they iteratively collect and analyze the data \nto somehow update the problem. \nAfter data processing, the analytic task is then carried out, \nwhich entails dynamic optimization computed by NSGAII algo- \nrithm of the jMetalSP library. The results of the analysis are used \nto feed data sinks. In this case study, we consider two of them: \none that stores the produced Pareto fronts in HDFS, and another one \nthat visualizes information about the Pareto front approximation \n(such as the number of solutions and the number of generated fronts) \nusing R-plot library. \nThe workflow implementing this case study is represented in \nFig. 3 , 17 where all the components are arranged according to data \nflow. In this workflow, the numeric indexes (1)–(8) correspond to \nthose steps as indicated in Table 8 , which contain the required \nSPARQL queries the semantic model applies to recommend forth- \ncoming component/s to use, in design time. For this case study, \nthe main set of individuals annotated in the semantic model and \ntheir relationships, are shown in Fig. 4 . 
Then it is possible to follow \nthe complete process step-by-step: \n•Step (1) . The workflow designer fetch all the optimization prob- \nlems from BIGOWL to select the implementation that better \nfits the required model for TSP instances. Interestingly, they are \nall subclasses of OptimizationProblem , which is integrated from \nDMOP. As a result, (s)he selects TSP. \n•Step (2) . Given a problem to solve, TSP in this case, the seman- \ntic model recommends a series of optimization algorithms that \ncould deal with it, i.e., those annotated algorithms that better \n17 Ontology instances available at https://www.github.com/KhaosResearch/ \nBIGOWL/blob/master/traffic.owl . adapt to the problem in terms of properties, such as: solution \nencoding, manages, dealWith , etc. After this, the designer selects \nNSGAII. \n•Step (3) . This is an intermediate step followed by the semantic \nmodel to recommend specific annotated component and task \ninstancing the underlying software that implements TSP and \nNSGAII. \n•Step (4) . Now, the objective of this query is to obtain the spe- \ncific data model to properly host data in problem and algorithm \ntasks. This step is thought to use specific domain knowledge \ninformation (traffic routes in this case) coming from external \nontologies. The resulting annotated instance here is MatrixNY , \nwhich refers to a data model comprising a matrix of points and \ndistances in the scenario of New York city. \n•Step (5) . Once the workflow designer has a clear idea about \nthe data model, (s)he can set data sources and connect them \nto feed the analysis. The semantic model is then queried to \nshow all possible data collectors, i.e., those previously anno- \ntated. Among all the resulting possibilities, ReadWebNYDataTraf- \nfic, DataCollectionDataTrafficKafka and DataCollectionTwitter are \nselected for this case study. \n•Step (6) . 
Before connecting data sources to analytic component, \na previous task is required for data processing and consolida- \ntion. In this case study, the corresponding component is im- \nplemented as a Spark processing task to join Kafka messages, \nTweets and traffic data streams. \n•Step (7) . Last steps usually correspond to data sink tasks to al- \nlocate results from analyses. For this case study, Visualization- \nTask and HDFSStoreTask are selected, which implement R-plot \nvisualization and storage in HDFS, respectively. \n•Step (8) . Finally, the semantic model is queried to obtain \nthe corresponding task instances that are mutually compati- \nble among them. The analytic workflow is now ready to be \nlaunched on the underlying running platform. \nMoreover, once the whole process is completed, a further rea- \nsoning procedure can now be started to check whether the gen- \nerated workflow is semantically consistent, or not. This reasoning \ntask will be explained in Section 5.3 . \n5.2. Case study 2: classification with Iris flower dataset \nAs commented before, the second case study consists in the \nacademic problem of Irish flower classification by means of deci- \nsion tree J48, a classical algorithm for data mining analytics. For \nmaterialization, two different approaches have been used in this \ncase: the well-known library for data mining Weka and the BigML \nSaaS API for analysis on-cloud. The aim is to illustrate how similar \nannotation and querying procedures with BIGOWL can be used to \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 551 \nTable 8 \nSPARQL queries for case study of streaming processing of New York city traffic open-data. 
\nStep SPARQL Result\n(1)SELECT DISTINCT ?problem WHERE {\n?problem rdf:type ?type .\n?type rdfs:subClassOf* dmop:OptimizationProblem .}TSP, ZDT1, ZDT2, ZDT3, ZDT4,\nZDT5, ZDT6, Kursawe..\n(2)SELECT DISTINCT ?algorithm\n(count(DISTINCT ?propertiesAlgorithm) AS numProperties)\nWHERE {\ntraffic:TSP bigowl:encodedBy ?solution.?algorithm rdf:type ?type.?type rdfs:subClassOf* bigowl:OptimizationAlgorithm.\n?entity bigowl:manages ?solution .\n?algorithm bigowl:dealWith ?propertiesAlgorithm .\ntraffic:TSP bigowl:hasFeature ?propertiesTSP .\nFILTER ( ?propertiesTSP in (?propertiesAlgorithm)).\n} GROUP BY ?algorithm ORDER BY DESC(?numProperties)NSGAII, MOCell,\nSMSEMOA,SPEA2, IBEA, PAES,\nPESA2, WASFGA\n(3)SELECT distinct ?comp ?task WHERE {\n?comp bigowl:hasProblem traffic:TSP .\n?comp bigowl:hasAlgorithm traffic:NSGAII .\n?comp rdf:type bigowl:Optimization .?task rdf:type bigowl:Task . ?task bigowl:hasComponent ?comp. }OptmimizationComponent,\nOptimizationTask\n(4)SELECT distinct ?data WHERE {\n?comp bigowl:hasProblem traffic:TSP .\n?comp bigowl:hasAlgorithm traffic:NSGAII .\n?comp rdf:type bigowl:Optimization .\n?task rdf:type bigowl:Task . ?task bigowl:hasComponent ?comp.\n?task bigowl:specifiesInputClass ?data . }MatrixNY\n(5)SELECT distinct ?dataCollection WHERE {\n?dataCollection rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataCollection.}ReadWebNYDataTraffic,\nDataCollectionHDFS,\nDataCollectionDataTrafficKafka,\nDataCollectionTwitter,\nDataCollectionDB, ...\n(6)SELECT distinct ?taskProcessing ?compProcessing WHERE {\n?taskCollection bigowl:hasComponent bigowl:ReadNYDataTraffic.\n?taskCollection bigowl:specifiesOutputClass ?out.\n?dataProcessing rdf:type ?typeProcessing .\n?typeProcessing rdfs:subClassOf* bigowl:DataProcessing.\n?taskProcessing bigowl:hasComponent ?dataProcessing .?taskProcessing bigowl:specifiesInputClass ?out.\n?taskProcessing bigowl:specifiesOutputClass traffic:MatrixNY. 
}SparkTask, ComponentSpark\n(7)SELECT distinct ?dataSink WHERE {\n?dataSink rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataSink.}VisualizationPlot,\nDataSinkHDFSStore,\nDataSinkOracleStore, ...\n(8)SELECT distinct ?task1 ?task2 WHERE {\n?task1 rdf:type bigowl:Task . ?task2 rdf:type bigowl:Task .\n?task1 bigowl:specifiesOutputClass ?output .\n?task2 bigowl:specifiesInputClass ?output . }GeneratorDataTrafficTask,\nSparkTask, TwitterCollectorTask,\nKafkaMGTask,\nReadNYDataTrafficTask,\nOptimizationTask, VisualizationTask\ncompose workflows on different platforms when solving the same \nproblem. \nFig. 5 shows the individuals (and their relationships) anno- \ntated in the ontology, and Fig. 6 18 represents graphically the an- \nalytic workflow for this case study. The numeric labels (1)–(5) are \n18 Ontology instances available at https://www.github.com/KhaosResearch/ \nBIGOWL/blob/master/weka.owl . aligned with their corresponding steps in Table 9 that contain the \nSPARQL queries used and their results. \nIn a nutshell, steps (1)–(3) are used to guide the workflow de- \nsigner on the selection of data model, algorithm, and analysis com- \nponents and tasks, respectively. Step (4) is used to query suit- \nable data collector components, in this case the designer selects \nDataCollectionBigML for BigML API instance and DataCollectorFS for \nWeka instance dataset. Step (5) queries are devoted to select possi- \nble data sink components, and specifically DataSinkFSStore and Vi- \n552 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nFig. 4. BIGOWL’s individuals annotated in the workflow for dynamic bi-objective optimization of TSP problem \nFig. 5. BIGOWL’s individuals in workflow for Irish flower classification with J48 decision tree instanced from Weka \nFig. 6. Workflow for Irish flower classification with J48 decision tree instanced from \nWeka and BigML. 
sualizationPlot , which implement orders to save results in file sys- \ntem and API method for plotting in BigML, respectively. Finally, \nstep (6) obtains the corresponding task instances that are mutu- \nally compatible among them throughout the complete workflow. \n5.3. Reasoning with BIGOWL \nReasoning procedure is built in BIGOWL with formulation of se- \nmantic rules on top of the OWL ontology, to deduce new informa- \ntion from the existing knowledge. These rules are formulated in \nSWRL and used to perform semantic reasoning jobs mainly de- \nvoted to check correctness of workflows, e.i., to discover those \ncomponents and tasks with (non-)compatible connectivity of in- \nputs/outputs, execution orders, data domains, data formats, data \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 553 \nTable 9 \nSPARQL queries for case study Irish flower classification on Weka, as well as on BigML. \nStep SPARQL Result\n(1)SELECT DISTINCT ?individual\nWHERE {\n?individual rdf:type ?type .\n?type rdfs:subClassOf* bigowl:DMDataClass .\n}Iris, Contact-lens, CPU, Diabetes,\nGlass, Ionosphre, Labor,\nReutersCorn, Segment,..\n(2)SELECT ?algorithm\nWHERE {\nweka:Iris rdf:type ?typeD .?typeD rdfs:subClassOf* ?classSomePropertyAlgorithm.?algorithm rdf:type ?type.?type rdfs:subClassOf* bigowl:DataMiningAlgorithm.\nbigowl:DataMiningAlgorithm rdfs:subClassOf* [\na owl:Restriction ;\nowl:onProperty bigowl:manages ;\nowl:someValuesFrom ?classSomePropertyAlgorithm ] .\n}J48, LogisticRegression, NaiveBayes,\nRepTree, IBk, LinearNNSearch,\nSMO, ...\n(3)SELECT distinct ?comp ?taskWHERE {\n?comp bigowl:hasAlgorithm weka:J48 .?task rdf:type bigowl:Task .\n?task bigowl:hasComponent ?comp. 
} ClassificationJ48Component,\nClassificationJ48Task\n(4)SELECT distinct ?dataCollection WHERE {\n?dataCollection rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataCollection.}DataCollectionOpenData,\nDataCollectionBigML,\nDataCollectionHDFS,\nDataCollectorFS, ...\n(5)SELECT distinct ?dataSink WHERE {\n?dataSink rdf:type ?type.\n?type rdfs:subClassOf* bigowl:DataSink.}VisualizationPlot,\nDataSinkHDFSStore,\nDataSinkOracleStore,\nDataSinkFSStore, ...\n(6)SELECT distinct ?task1 ?task2 WHERE {\n?task1 rdf:type bigowl:Task . ?task2 rdf:type bigowl:Task .\n?task1 bigowl:specifiesOutputClass ?output .?task2 bigowl:specifiesInputClass ?output . }ClassAsignerIrisTask,\nClassificationJ48Task,\nClassifierPerformanceEvaluatorTask,\nCrossValidaionFolderMarkerTask,\nTextViewerTask\ntypes, etc. SWRL rules are then evaluated by the reasoner after \nclassifying Big Data components in accordance with axioms, as de- \nfined in Table 1 . In concrete, there are two types of axioms associ- \nated with OWL-DL classes for reasoning, namely: subClassOf , which \nis used to define the necessary conditions for a class to be consid- \nered a member of a given OWL class; and equivalentClass , for an- \nnotating when two classes can be considered as equivalent, if they \ncomply the conditions. \nBIGOWL imports subClassOf axioms from DMOP to specify tax- \nonomy classification of Data Mining contexts and their data. In this \nsense, subclasses are also the natural way of describing hierarchy \nof algorithmic families and versions in optimization analyses. For \ninstance, Genetic Algorithms are subclasses of Evolutionary Algo- \nrithms and these in turn, are subclasses of Population Based Algo- \nrithms. This structural information is then considered in reasoning \ntime for algorithm recommendation. The main axioms for subclass \nclassification are defined in Table 10 , which correspond to Data \nMining and Optimization algorithmic families. 
\nFurthermore, a series of specific SWRL rules are described for \nassessing the compatibility of components. As commented before, \nthe main goal is to address the generation of well-formed Big Data \nworkflows. A description of these rules is as follows: - Compatibility between task, component and Data Mining \nalgorithm . This rule is used to check that input data model is com- \npatible with the task that is indeed an instance (or implementa- \ntion) of a component. In this specific case, the used component \nrefers to a Data Mining Algorithm to perform a specific analysis. \nIn short, this rule is used by the reasoner to validate compatibility \nbetween data mining component and data source. The result is a \npredicate indicating that data “feeding” the component are com- \npatible with the analytic algorithm, so a task can be launched to \nrun it on the underlying platform. \nbigowl:specifiesInputClass(?task, ?data) ˆ\nbigowl:hasComponent(?task, ?comp) ˆbigowl:hasAlgorithm(?comp, ?alg) ˆbigowl:DataMiningAlgorithm(?alg) ˆ\nbigowl:DMDataClass(?data)\n-> bigowl:isCorrect(?alg, ?data)\nNote that a similar rule is defined in the semantic model to \nconsider optimization algorithms. \n- Compatibility between tasks of a workflow . This rule is ap- \nplied to a complete workflow. It is used to check that input/output \ndata connections of each pair of consecutive tasks are “semanti- \n554 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nTable 10 \nOWL axioms for algorithmic subclass classification. 
\nClass Classification rule\nOptimization AlgorithmOptimizationAlgorithm subClassOf\n((implements some OptimizationStrategy) and\n(resolves some OptimizationProblem)) or Algorithm\nDataMining AlgorithmOptimizationAlgorithm subClassOf\n(manages some DMDataClass) or Algorithm\nOptimization ComponentOptimization subClassOf (hasAlgorithm only\n(OptimizationAlgorithm or MachineLearning))\nDataMining ComponentDataMining subClassOf (hasAlgorithm only\n(DataMiningAlgorithm or MachineLearning))\ncally” similar. The outcome is a new predicate indicating whether \neach two consecutive tasks are mutually compatible, or not. \nWorkflow(?w) ˆ\nbigowl:hasTask(?w, ?task1) ˆbigowl:order(?task1, ?ord1) ˆbigowl:hasTask(?w, ?task2) ˆ\nbigowl:order(?task2, ?ord2) ˆ\nswrlb:add(?ord2, ?ord1, 1) ˆbigowl:specifiesInputClass(?task2, ?data)ˆ\nbigowl:specifiesOutputClass(?task1, ?data)\n-> bigowl:compatibleWith(?task1, ?task2)\n- Connectivity between tasks and data . Similarly to the pre- \nvious one, this rule is used to indicate that two instances of tasks \nare properly linked, that is to say, it checks that the input data of \ntask2 are covered with the output data of task1 , according to \nthe execution order established in the workflow. \nWorkflow(?w) ˆbigowl:hasTask(?w, ?task1) ˆbigowl:order(?task1, ?ord1) ˆbigowl:hasTask(?w, ?task2) ˆ\nbigowl:order(?task2, ?ord2) ˆ\nswrlb:add(?ord2, ?ord1, 1) ˆbigowl:specifiesInputClass(?task2, ?data) ˆ\nbigowl:specifiesOutputClass(?task1, ?data)\n-> bigowl:isConnected(?task2, ?data)\n- Workflow correctness . Finally, this rule validates that all the \ncomponents, instanced by corresponding tasks and data sources, \nare correctly arranged and connected. The result is then a new \npredicate indicating whether the complete workflow is correct, or \nnot. 
\nWorkflow(?w) ˆbigowl:hasTask(?w, ?task) ˆbigowl:numberOfInput(?task, ?nIn) ˆbigowl:isConnected(?task, ?data).\nsqwrl:makeSet(?set, ?data) ˆ\nsqwrl:groupBy(?set, ?task).sqwrl:size(?cont, ?set) ˆ\nswrlb:equal(?cont, ?nIn)\n-> sqwrl:select(?cont, ?nIn, ?task) ˆbigowl:isCorrectWorkflow(?w, true)In summary, these case studies are used as a “proof of concept ”\nto somehow highlight that the proposed semantic model is able to \nsupport in the design of Big Data analytics. In this regard, BIGOWL \nenables automatic SPARQL querying for component recommenda- \ntion, as well as reasoning procedures for workflow validation. \n6. Discussions \nOne of the main research findings we claim with the design \nand implementation of BIGOWL is the ability to represent and con- \nsolidate knowledge involving Big Data analytics. This semantic ap- \nproach allows us to annotate (i.e. to “semantize”) all the meta- \ndata flowing from multiple data sources, processing components \nand analytic algorithms. The meta-data are integrated following \nthe BIGOWL structure and stored in an RDF repository. \nOn the one hand, the results obtained in the two case stud- \nies indicate that, driven by the ontological model, it is possible \nto progressively deliver component recommendations for the con- \nstruction of Big Data analytics workflows. The resulting workflows \nare indeed enhanced with semantic knowledge that explicitly de- \nscribes and registers the data lineage (data provenance in database \nsystems), from sources to results. It also would enable to replay \nspecific portions or inputs of the data flow for step-wise debug- \nging or regenerating lost outputs. In the BIGOWL semantic model, \ndata linage is mapped with RDF triples referring to records of the \ninputs, entities, systems, algorithms and processes that influence \ndata of interest, hence providing a historical record of the data ob- \ntained (as results) and its origins (as sources). 
\nBased on the analysis provided in the two case studies, the \nuser is able to identify the correct path the data follow and how \nthey are modified to obtain added value, for a given domain of \nknowledge. For example, in the first case study, a series of data \nsources involving information about urban traffic in the city of \nNew York (with geo-locations, travel times, densities, tweets, etc.) \nare semantically related (or linked) to the results obtained, in form \nof optimized routes in a problem characterization of the classical \nTSP. In this case study, the outputs are encoded in form of routes, \nwhere the travel time and the routing distance are optimized. This \nway, the resulting routes are linked to the traffic densities and the \nTwitter messages, so the data lineage is registered with semantic \nannotations. \nSimilarly, in the second case study, it is possible to connect \nprediction accuracies with classification algorithms, for the Iris \nflower database. In addition, the running experiences acquired \nwhen using different execution frameworks, e.g., in-house/in-cloud, \nare also annotated as results. \nC. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 555 \nAnother important finding lies in the possibility of using the \nsemantic knowledge-base, now consolidated in the RDF repository, \nto perform reasoning tasks, hence to infer new knowledge. In this \nstudy, a reasoner is used to evaluate a series of SWRL rules defined \nfor the specific task of workflow validation. In this regard, the val- \nidation analysis performed by the reasoner required 644 ms for \ncase study 1 and 673 ms for case study 2. Taking into account that \nwe used the Stardog OWL 2 reasoner, the time spent in reasoning \ntasks is acceptable for workflow validation. 
\nOn the other hand, the main constraint of the proposed seman- \ntic model is that it needs a domain ontology to cover the prob- \nlem knowledge domain. This domain ontology contains the spe- \ncific concepts for a given case, so it can be reused in domains \nwhere previous efforts provided such a model. However, if such on- \ntology is not available, then its design is required. As explained \nin Section 4.1 , the class Data in BIGOWL is used, not only to an- \nnotate all the data flowing in the analytic workflow, but also to \nallow alignment with third parties’ ontologies covering the spe- \ncific problem domain of knowledge. Additionally, the general on- \ntology could miss concepts that would be needed in some cases \nand are not described in the current model. This constraint can be \nsolved by proposing an extension, in form of new version release \nof BIGOWL, through a collaborative portal. In this sense, BIGOWL is \npublicly available at WebProtégé, 19 where any registered user can \nintroduce changes. These changes will be reviewed on a regular ba- \nsis to approve or reject them. The last stable version of the ontol- \nogy will be provided in the project GitHub repository. 20 \nIn addition, a secondary constraint arises when a new workflow \nis generated or executed by a user, since a series of new annota- \ntions are required to store all the meta-data involved in the data \nanalytic process, in form of RDF triples. This makes the RDF repos- \nitory increase significantly, which would promote, not only fu- \nture reasoning procedures to infer new knowledge from these data, \nbut also their connection with other Linked Data. In this sense, \nthe efficient management of large RDF repositories has become a \nchallenging task attracting many scholars to research ( Zomaya & \nSakr, 2017 ), which means a clear implication for academia. 
\nIn terms of practical implications, the proposed semantic model \nrepresents an initial demonstrator for the experimental piloting of \nBig Data frameworks enhanced with semantics. The objective is to \nobtain “Smart Data” and promote the data value chain in industry \nprocesses, which is a key challenge nowadays as reflected in the \nStrategic Research and Innovation Agenda of the Big Data Value As- \nsociation (EU SRIA 4.0 BDVA). 21 Several industrial projects in this \nassociation, like BigDataEurope 22 and BigOceanData, 23 are focused \non exploiting semantics in Big Data analytics, so they could par- \ntially take advantage of BIGOWL as reference ontological model. \n7. Conclusions \nIn this work, an ontological approach called BIGOWL is pro- \nposed to provide a conceptual framework for the annotation of \nBig Data analytics. The proposed semantic model is materialized \nby means of an RDF repository, and programmatic querying and \nreasoning functions. \nTo test the initial hypothesis, two case studies have been devel- \noped, which consist in: (1) real-world streaming traffic data pro- \ncessing for route optimization in urban environment, and (2) aca- \ndemic data mining classification on local/on-cloud platforms. The \n19 WebProtégé https://www.goo.gl/F6fYUc . \n20 GitHub https://www.github.com/KhaosResearch/BIGOWL . \n21 http://www.bdva.eu/sites/default/files/BDVA _ SRIA _ v4 _ Ed1.1.pdf . \n22 https://www.big- data- europe.eu/ . \n23 http://www.bigoceandata.com/ . experience on these cases revealed that BIGOWL approach is useful \nwhen integrating knowledge domain concerning a specific analytic \nproblem. Consequently, the integrated knowledge is used for guid- \ning the design of Big Data analytics workflows, by recommending \nnext components to be linked, and supporting final validation. 
\nIt is worthy to declare that the proposed semantic model is cur- \nrently populated with those annotated elements required to set the \ncase studies reported in this work, although it can be feed with \nnew instances regarding other Big Data workflows. \nThis motivates our future research agenda, which entails a \nfirst phase to provide automatic facilities for ontology population, \nhence to enrich the semantic approach; second, to provide new \nmechanisms to promote the use of contextual domain of knowl- \nedge in the generation of Big Data analytic solutions; and third, to \ngenerate new and heterogeneous use cases of analytics workflows \nthat would led us to find and solve new possible deficiencies, as \nwell as to enrich the knowledge base. \nReferences \nACM-SIGKDD (2014). Data mining curriculum. ACM SIGKDD 2006-04-30. Retrieved \n2014-01-27. \nAllahyari, M. , Kochut, K. , & Janik, M. (2014). Ontology-based text classification into \ndynamically defined topics. In 2014 IEEE international conference on semantic \ncomputing (pp. 273–278) . \nBarba-González, C. , García-Nieto, J. , Nebro, A. J. , Cordero, J. A. , Durillo, J. J. , \nNavas-Delgado, I. , et al. (2017). Jmetalsp: A framework for dynamic multi-ob- \njective big data optimization. Applied Soft Computing . In–Press–Online \nDiamantini, C., Potena, D., & Storti, E.. Ontology-driven kdd process composition. \nDou, D. , Wang, H. , & Liu, H. (2015). Semantic data mining: A survey of ontolo- \ngy-based approaches. In Semantic computing (icsc), 2015 ieee international con- \nference on (pp. 244–251). IEEE . \nGrosof, B. N. , & Poon, T. C. (2004). SweetDeal: Representing agent contracts with \nexceptions using semantic web rules, ontologies, and process descriptions. In- \nternational Journal of Electronic Commerce, 8 (4), 61–97 . \nGruber, T. R. (1995). Toward principles for the design of ontologies used for \nknowledge sharing? International Journal of Human-Computer Studies, 43 (5–6), \n907–928 . 
\nGruber, T. R. , et al. (1993). A translation approach to portable ontology specifica- \ntions. Knowledge Acquisition, 5 (2), 199–220 . \nHarris, S. , Seaborne, A. , & Prud’hommeaux, E. (2013). Sparql 1.1 query language. W3C \nRecommendation, 21 (10) . \nHorrocks, I. , Patel-Schneider, P. F. , Bechhofer, S. , & Tsarkov, D. (2005). OWL rules: \nA proposal and prototype implementation. Web Semantics: Science, Services and \nAgents on the World Wide Web, 3 (1), 23–40 . \nJing, L. , Ng, M. , & Huang, J. (2010). Knowledge-based vector space model for text \nclustering. Knowledge and Information Systems, 25 (1), 35–55 . \nKeet, C. , Ławrynowicz, A. , d’Amato, C. , Kalousis, A. , Nguyen, P. , & Palma, R. (2015). \nThe data mining optimization ontology. Web Semantics, 32 , 43–53 . \nKietz, J. , Serban, F. , Bernstein, A. , & Fischer, S. (2010). Data mining workflow tem- \nplates for intelligent discovery assistance and auto-experimentation. In Proceed- \nings- of the ecml/pkdd: 10 (pp. 1–12) . \nKonys, A. (2016). Ontology-based approaches to big data analytics. In International \nmulti-conference on advanced computer systems (pp. 355–365) . \nKuiler, E. W. (2014). From big data to knowledge: An ontological approach to big \ndata analytics. Review of Policy Research, 31 (4), 311–318 . \nKumara, B. T. G. S. , Paik, I. , Zhang, J. , Siriweera, T. H. A. S. , & Koswatte, K. R. C. (2015). \nOntology-based workflow generation for intelligent big data analytics. In 2015 \nieee international conference on web services (pp. 495–502) . \nLi, L. , Yevseyeva, I. , Basto-Fernandes, V. , Trautmann, H. , Jing, N. , & Em- \nmerich, M. (2017). Building and using an ontology of preference-based multi- \nobjective evolutionary algorithms. In H. Trautmann, G. Rudolph, K. Klamroth, \nO. Schütze, M. Wiecek, Y. Jin, & C. Grimme (Eds.), Evolutionary multi-criterion \noptimization: 9th international conference, EMO 2017, Münster, Germany, March \n19–22, 2017, proceedings (pp. 406–421). 
Cham: Springer International Publish- \ning . \nMarinica, C. , & Guillet, F. (2010). Knowledge-based interactive postmining of associa- \ntion rules using ontologies. IEEE Transactions on Knowledge and Data Engineering, \n22 (6), 784–797 . \nMasolo, C. , Borgo, S. , Gangemi, A. , Guarino, N. , & Oltramari, A. (2003). Wonderweb \ndeliverable d18, ontology library (final). ICT Project, 33052 . \nMcBride, B. (2004). The resource description framework (rdf) and its vocabulary de- \nscription language rdfs. In Handbook on ontologies (pp. 51–65). Springer . \nMcGuinness, D. L. , Van Harmelen, F. , et al. (2004). Owl web ontology language \noverview. W3C Recommendation, 10 (10), 2004 . \nNguyen, P. , Hilario, M. , & Kalousis, A. (2014). Using meta-mining to support data \nmining workflow planning and optimization. Journal of Artificial Intelligence Re- \nsearch, 51 , 605–644 . \nNoy, N. , & McGuinness, D. L. (2001). Ontology development 101: A guide to creating \nyour first ontology. Technical report . \n556 C. Barba-González et al. / Expert Systems With Applications 115 (2019) 543–556 \nNoy, N. F., McGuinness, D. L. et al. (2001). Ontology development 101: A guide to \ncreating your first ontology. \nPhan, N. , Dou, D. , Wang, H. , Kil, D. , & Piniewski, B. (2015). Ontology-based deep \nlearning for human behavior prediction in health social networks. In Proceed- \nings of the 6th ACM conference on bioinformatics, computational biology and health \ninformatics (pp. 433–442). ACM . \nPinto, A. , Scioscia, F. , Loseto, G. , Ruta, M. , Bove, E. , & Sciascio, E. D. (2015). A seman- \ntic-based approach for machine learning data analysis. In 2015 IEEE international \nconference on semantic computing (ICSC) (pp. 324–327) . \nPrud, E. , & Seaborne, A. (2006). Sparql query language for rdf. W3C Recommendation . \nRistoski, P. , & Paulheim, H. (2016). Semantic web in data mining and knowledge \ndiscovery: A comprehensive survey. 
Web Semantics: Science, Services and Agents \non the World Wide Web, 36 , 1–22 . \nRoldán-García, M. , García-Nieto, J. , & Aldana-Montes, J. F. (2017). Enhancing seman- \ntic consistency in anti-fraud rule-based expert systems. Expert Systems with Ap- \nplications, 90 (Supplement C), 332–343 . Shearer, C. (20 0 0). The crisp-dm model: The new blueprint for data mining. Journal \nof Data Warehousing, 5 (4), 13–22 . \nSirin, E. , Parsia, B. , Grau, B. C. , Kalyanpur, A. , & Katz, Y. (2007). Pellet: A practical \nowl-dl reasoner. Web Semantics: Science, Services and Agents on the WWW, 5 (2), \n51–53 . \nStaab, S. , & Studer, R. (2013). Handbook on ontologies . Springer Science & Business \nMedia . \nYaman, A. , Hallawa, A. , Coler, M. , & Iacca, G. (2017). Presenting the ECO: Evolution- \nary computation ontology. In European conference on the applications of evolu- \ntionary computation (pp. 603–619) . \nZáková, M. , Kremen, P. , Zelezny, F. , & Lavrac, N. (2011). Automating knowledge dis- \ncovery workflow composition through ontology-based planning. IEEE Transac- \ntions on Automation Science and Engineering, 8 (2), 253–264 . \nZomaya, A. Y. , & Sakr, S. (2017). Handbook of big data technologies (1st). Springer \nInternational Publishing . ",
"metadata": {
"filename": "BIGOWL2019.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\BIGOWL2019.pdf",
- "file_size": 2271408,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:36.608723",
- "content_length": 74849
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\BIGOWL2019.pdf",
+ "size": 2271408,
+ "source": "docs_to_import"
+ },
+ "id": "a2079249-0ae0-4430-8573-2c14b24a8efe"
},
- "036993c2-b385-494a-a35a-12b1a21af260": {
- "id": "036993c2-b385-494a-a35a-12b1a21af260",
- "content": "[Página 1]\nTesting of big data analytics systems by benchmark \n \nMingang Chen \nShanghai Key Laboratory of Computer Software Testing \nand Evaluating \nShanghai Development Center of Computer Software \nTechnology \nShanghai, China \ncmg@ssc.stn.sh.cn Wenjie Chen, Lizhi Cai \nShanghai Key Laboratory of Computer Software Testing \nand Evaluating \nShanghai Development Center of Computer Software \nTechnology \nShanghai, China \ncwj@ssc.stn.sh.cn, clz@ssc.stn.sh.cn \n \n \nAbstract —With the rapid development of big data \ntechnologies and applications, various big data analytics systems \nhave been released by open source communities and industry. So \ntesting and evaluating the overall performance of these big data \nanalytics systems has become an important research topic. The \npaper analyzes in detail the challenges of testing big data \nanalytics systems and proposes the method and strategies for the \ntesting. Furthermore, the paper presents two cases of testing big \ndata analytics systems by benchmark. \nKeywords—testing; big data; benchmark; TPC-DS; TPCx-\nBigBench. \nI. INTRODUCTION \nIn recent years, big data has become a hot topic for \ngovernments and enterprises, and it is considered as a new \ndriving force for innovation in the information era. This is \nbased on the following two facts: firstly, in the past ten years, \nthe speed of data generating is becoming faster and faster, and \nwe have already entered the big data era; secondly, big data \ncontains huge values, and has brought about revolutionary \ndevelopments in many fields, such as e-commerce, finance, \ntransportation, medical and health service, etc. \nHowever, the “3V” characteristics (volume, variety, and \nvelocity) of big data make challenges for data processing and \nanalytics. 
Recently, industry and academia have launched a \nvariety of big data analytics system to cope with the challenges, \nsuch as open source Apache Hive [1], Apache Spark [2], and \ncommercial Transwarp Inceptor, Cloudera Impala, IBM Big \nSQL and so on. More and more enterprises or organizations \nuse big data analytics system to build the business application \nand obtain decision support from data. Therefore, testing and \nevaluating big data analytics systems has become one of the \nimportant research subjects of the big data fields. \nTesting of big data analytics system mainly has the \nfollowing three roles. (1) We can verify the correctness of \nfunctionalities and the reliability of the big data analytics \nsystem before it is deployed and put to use. (2) We can carry \nout a fair comparison of the performance of different big data \nanalytics systems. (3) We can optimize the performance of big \ndata analytics systems by testing. \nPresently, testing of big data analytics system mainly uses \nbenchmarks, and by benchmark testing, we can analyze and evaluate the functionalities, performance, reliability, and \ncompatibility of the system. There are three categories of \nbenchmark in the testing of big data analytics systems. The first \ncategory is the micro benchmark. This category of benchmark \nprincipally aims at testing a certain component of the big data \nanalytics system thus is also called component-level \nbenchmark. Such as TeraSort can only be used to test the \nsystem’s performance for sorting text data, and GridMax can \nonly be used to test the performance of various MapReduce job \nin the Hadoop clusters. Therefore, the micro benchmark cannot \nevaluate the performances of big data analytics system entirely. \nThe second category is the comprehensive benchmark. This \ncategory of the benchmark can test more than one components \nof big data analytics system. 
For example, Hibench is a \ncomprehensive benchmark, and its workload including micro \nbenchmarks, web search, SQL query and machine learning [3]. \nThe third category is the application oriented benchmark, \nwhich is characterized by simulating the scenario of big data \napplications in the enterprise. TPC-DS is a benchmark for \ntesting big data decision support systems [4, 5]. TPCx-\nBigBench [6, 7] is the first end-to-end, application-level big \ndata benchmark based on TPC-DS. Due to the standardization \nand usability of TPC-DS and TPCx-BigBench, more and more \norganizations begin to use these two benchmarks to test, \nevaluate and compare the overall performance of big data \nanalytics systems. \nThis paper will discuss in detail the challenges of testing big \ndata analytics systems in Part II, and propose method and \nstrategies of how to test big data analytics systems in Part III. \nIn Part IV, two cases of testing will be presented, that is testing \nof Transwarp Inceptor by TPC-DS and performance \ncomparison of Hive and Spark SQL by TPCx-BigBench. In \naddition, some preliminary analysis will be made on how to \noptimize the performance of Spark SQL by benchmark testing. \nFinally, we conclude the paper in section V. \nII. THE CHALLENGES OF TESTING BIG DATA ANALYTICS SYSTEM \nDue to the “3V” characteristics of big data and the \ncomplexity of big data analytics system, this brings about \nchallenges for testing big data analytics system. \nFirst is the complexity of the technologies on big data \nanalytics system. It generally adopts distributed architectures, \nsuch as master-slave or peer-to-peer. And factors that will \nThis work was funded by Science and Technology Commission of \nShanghai Municipality Program (16511101202, 17411952800). 
\n2312018 IEEE International Conference on Software Testing, Verification and Validation Workshops\n0-7695-6432-1/18/$31.00 ©2018 IEEE\nDOI 10.1109/ICSTW.2018.00054\n\n[Página 2]\naffect the performance of the system under test are complex, \nsuch as network environment, hardware configurations, system \nconfiguration parameters, and virtualization etc. For instance, \nHadoop system has over 200 configuration parameters. \nSecond is the complexity of test datasets. The test datasets \nof big data analytics system need not only to meet the “3V” \ncharacteristics of big data but also to represent typical business \nscenes. \nThird are the challenges of testing methods and tools, such \nas the traditional testing tools can no longer be appropriate, \nlacking automatic testing methods and the customization of \ntesting and diagnosing schemes. Different modules in the big \ndata analytics require different testing techniques. For example, \nwe test the performance of Spark SQL by SQL’s queries, while \nwe test throughput and latency of Spark Streaming by loading \nstreaming data. \nFourth, the testing of big data analytics system requires \nmore professional and more comprehensive testing abilities. \nTesters not only need to have the testing expertise but also need \nto master the big data analysis and processing technology. For \nexample, testers need to know how to load data from Hadoop \nHDFS into a Hive table and verify if the loading is correct. \nIII. BENCHMARK TESTING METHOD OF BIG DATA ANALYTICS \nSYSTEM \nThe testing of big data analytics system with benchmark \ncan generally be divided into 6 phases, that is requirement \nanalysis for testing big data analytics systems, preparing the \ntesting environment, preparing the test datasets and workload, \nloading the test datasets, testing for the big data analytics \nsystem and analysis of the testing result, as shown in Fig.1. \n \nFig.1. Benchmark testing method of big data analytics system \nA. 
Requirement analysis of testing for big data analytics \nsystems \nThe phase of requirement analysis for testing big data \nanalytics systems is by and large same as traditional software \ntesting, including specifying the objects of testing, the purposes \nof testing, the environment of testing, the datasets of testing, \ntechnology and tools of testing and the risk of testing, etc. But the key point of testing big data analytics system is the \nperformance and reliability of the system. For example, how \nefficient is the system's processing and analysis of data with \nlarge-scale datasets? Whether tasks of data processing can be \nmigrated automatically or not when a node in the cluster goes \ndown? Will the data be lost in a distributed environment when \na node crashes? \nB. Preparing the testing environment \nIn order to test a big data analytics system, we need to \nprepare a cluster of distributed data storage and computing, at \nthe same time a sufficient storage space is required to store and \nanalyze the large-scale datasets. It is worth noticing that the \nstorage space here not only refers to the hard disk space but \nalso memory space, especially in testing Apache Spark, due to \nthe 60% occupation of memory is used for buffering RDD (the \ndata structure of Spark), so enough memory space should be set \napart for the testing program. We should be careful that, the \ntesting environment should be ensured “clean”. In other words, \nwe should ensure that there is no other applications running in \nthe cluster, the CPU and memory of the node in the cluster are \nboth at their minimum utilization. \nC. Preparing the test datasets and workload \nThe datasets for testing big data analytics system comes \nfrom two sources: one is the real data from business, such as \ndata from weblogs or database of business; the other is \nsimulated data generated by big data benchmarking tools. 
TPC-\nDS and TPCx-BigBench are two benchmarks that have been \nnominated in the industry. It should be noted that we should set \nappropriate data scale, data type, and data model according to \nthe requirement of the testing. The workload is the core of \nperformance testing of big data analytics system. It needs to \nreflect business scenarios and data analytical techniques. The \nworkload in TPC-DS or TPCx-BigBench is the set of queries to \nbe executed against the test datasets. \nD. Loading the test datasets \nDuring the phase of loading test datasets, we should verify \nif the data has been loaded correctly into the distributed storage \nsystem. For example, whether the data is loaded into the right \nHDFS storage directory? Is the size of data file correct? If the \ndata need to be loaded into the distributed database system, we \nshould verify if the data can be load into the table in the \ndatabase correctly. \nE. Testing of the big data analytics system \nTesting of the big data analytics system needs to focus on \nsystem’s functionality, performance, reliability, and \ncompatibility. \n1) Functionality testing \nThe Functionality testing of big data analytics system mainly \nverifies whether functions of the system in data storage, data \nprocessing, data I/O etc. are correct? For example, whether \ndata processing based on MapReduce is correct? Whether the \nresults of SQL queries on the SQL-On-Hadoop system are \ncorrect? And whether the data I/O is complete? \n232\n\n[Página 3]\n2) Performance testing \nThe performance testing of big data analytics system needs \nto test the performance of data I/O, data processing and \nanalytic and the performance of SQL query on the system and \nso on. For example, we can test the reading and writing \nperformance of Hadoop HDFS using single large data file or \nmultiple large data files. For SQL-On-Hadoop systems, the \nperformance of SQL query is the most important performance \nmetric. 
\n3) Reliability testing \nThe reliability testing of big data analytics system needs to \nfocus on the following two aspects: \n• If the task can be automatically migrated when a task \nof data analytics failed at a certain node (may be due to lack of \nmemory), so as to ensure the task is executed correctly? \n• If one or some nodes in the cluster go down, will the \ntask of the data analytic be executed correctly due to the fault-\ntolerant mechanism of the system? \n4) Compatibility Testing \nThe compatibility testing of big data analytics system needs \nto verify the compatibility of the file system, the compatibility \nof data storage format, the compatibility of SQL syntax and so \non. \nF. Analysis of the testing result \nDuring the phase of analysis of the testing result, we need \nto analyze system’s testing metrics (functionality, performance, \nreliability, and compatibility) comprehensively according to the \ntesting requirement and finish the testing report. \nIV. CASES OF TESTING BIG DATA ANALYTICS \nAccording to the test method described in Part ċ, in this \nsection, we present two cases of testing big data analytics \nsystem. \nA. Testing for Transwarp Inceptor by TPC-DS \n1) Requirement analysis of testing Transwarp Inceptor \nThe purpose of testing Transwarp Inceptor is to verify the \nfunctionality of ETL ˈand evaluate the performance of SQL \nquery and compatibility of SQL syntax through automated \ntesting scripts. The method of testing follows the TPC-DS \nspecification. \n2) The system under test and environment \na) Transwarp Inceptor big data analytics system \nInceptor is a commercial big data analytics system \ndeveloped by Transwarp Technology Co., Ltd. It provides \nhigh-speed SQL analytics based Apache Spark. It can help \nbusinesses to build high-speed, scalable data warehouses, and \nperform interactive analysis, real-time reporting, and \nvisualization of data. 
Transwarp Inceptor has a three-tier \nstructure from bottom to top: the storage layer, the distributed computing engine layer and the interface layer, as is shown in \nFig.2. \n \nFig.2. Architecture of Transwarp Inceptor \nb) Test environment \nThe test environment consists of four physical servers, and \nthe configurations of servers are same, as is shown in Table I. \nFour servers make up a Transwarp cluster through Gigabit \nnetwork. \nTABLE I. THE HARDWARE CONFIGURATION OF THE TESTING SERVERS \n Node1 Node2 Node3 Node4 \nModel Dell PowerEdge R720 \nCPU Intel(R) Xeon(R) CPU E5-2620 v2 @ 2.10GHz \n ( 2 CPU x 6 cores) \nMemory \n(GB) 256 256 256 256 \nStorage 24 TB HDD hard drive \nOperating \nSystem Red Hat Enterprise Linux 6.5 \nHadoop Transwarp DataHub v3.4 Hadoop 2.2 \nInceptor Transwarp Inceptor v4.0 \nRoles Primary \nNameNode, \nInceptor Server, \nDataNode Secondary \nNameNode, \nInceptor \nMetaStore, \nDataNode \nDataNode \nDataNode \n3) Generating test datasets and workload by TPC-DS \na) TPC-DS \nTPC-DS is testing benchmark for decision support system \nproposed by TPC (Transaction Processing Performance \nCouncil). TPC-DS models the decision support functions of a \nretail product supplier. The business model of benchmark \nsimulates sales and returns of the three main channels (stores, \nonline retailers, and catalogs). The business model contains 7 \nfact tables and 17 dimension tables, and tables are organized by \nstar and snowflake mixed model. A reduced business model of \nTPC-DS is shown in Fig.3. \n233\n\n[Página 4]\nFig.3. TPC-DS database schema \nTPC-DS allows users to generate the different scale of \ndatasets from 100G to 100T according to the user’s test \nrequirements and test environment. In general, the TPC-DS \nbenchmark has following characteristics: \n• A large amount of business data and test cases (SQL \nqueries) can answer real business problems. 
\n• A total of 99 SQL queries follow the SQL 99 and SQL \n2003 core syntax standard, and SQL queries are \ncomplex. \n• The test cases include a variety of business models, \nsuch as interactive query, statistical analysis, iterative \nOLAP and data mining. \n• Almost all of the test cases need high I/O loading and \nCPU computing. \nb) The generation of test datasets and workload \nIn this phase, we use the data generation and query \ngeneration tools (DSTools v1.3.0) provided by the TPC-DS \nbenchmark to generate 500GB test datasets and 99 SQL \nqueries through automated shell scripts, and the script fragment \nis as follows. \n# Generate 500GB test datasets in the specified HDFS directory \n1: dbgen2 -scale 500 -dir HDFS_LOCATION \n \n# Generate 99 queries compatible with Oracle syntax for 500GB \ndatasets through the query template \n2: qgen2 –query99.tpl –directory QUERY_TEMPLATE –dialect \noracle -scale 500 \nThe 500GB test datasets consist of 24 tables of the database \n(7 fact tables and 17 dimension tables) mentioned above. The \n99 SQL queries implement business intelligence by answering \nreal business questions. \n4) Data loading \nIn the data loading phase, we first create 24 tables in \nTranswarp Inceptor to build the data warehouse for testing. The \nschemas of tables are provided by the TPC-DS benchmark. \nThen we load the datasets that have been generated in the \nHDFS into tables. The following script fragment shows how to \nload datasets in HDFS into the inventory table. \n# load inventory.dat into the inventory table \n1: LOAD DATA inpath '/tpc_ds/data/inventory.dat' INTO TABLE \ninventory; \n5) Testing for Transwarp Inceptor \nThe core of the TPC-DS based benchmark testing is the \nexecution of 99 SQLs one by one. In testing, we verify the \ncorrectness of the test results and record the execution time of \nSQL. We execute 99 SQLs with automated scripts by three rounds and take the average time of three rounds as SQL’s \nexecution time. 
The following script fragment shows how to \nexecute 99 SQL queries sequentially in Transwarp Inceptor. \n# Execute all 99 SQL queries one by one \n1: for(i = 1; i<=99; i++ ){ \n2: sql = \"query\"+ i + \".sql\"; \n3: system( \"transwarp -t -h localhost -f ./sql/\" + sql); \n4:}\n6) Testing Analysis \nIn the case of the 500GB test datasets, the four categories \nof SQL execution time are shown in Table II. Test results \nshow that 96 out of 99 SQL queries can be run directly in \nTranswarp Inceptor. There only 3 SQL queries need minor \nmodification to be compatible with SQL compiler of \nTranswarp Inceptor. Considering that the TPC-DS \nspecification allows SQL’s minor modification, so Transwarp \nInceptor has good compatibility with SQL 2003 standard. \nTABLE II. SQL QUERIES ’ EXECUTION TIME OF TRANSWARP INCEPTOR \nSQL \nCategories The number \nof SQL The total \nexecution time \n(seconds) The average \nexecution time \n(seconds) \nInteractive \nquery 9 197 21.9 \nStatistical \nanalysis 69 7705 111.7 \nIterative OLAP 10 4232 423.2 \nData mining 11 3502 318.4 \nB. Testing Hive vs. Spark SQL by TPCx-BigBench \n1) Requirement analysis of te sting Hive vs. Spark SQL \nTesting Hive vs. Spark SQL has two purposes. One is to \nutilize TPCx-BigBench as a benchmark for evaluating and \ncomparing the performance of two SQL-On-Hadoop analytics \nsystems. The other is to tune system parameters for optimizing \nanalytics system’s performance. \n2) Systems under test and test environment \na) Hive \nHive is one of the first data analytics engines to be built on \ntop of MapReduce. It was originally developed by Facebook to \nsupport data analysts to analyze large datasets in Hadoop by \nqueries in a SQL-like declarative query language. This SQL-\nlike language is called HiveQL and is based on the SQL \nlanguage, but does not strictly follow the SQL 99 standard. \nHive has now become the foundation of new SQL on Hadoop \nprojects, such as Impala, Presto, and Spark SQL. 
Hive \nmetadata has become the de facto standard for users to store \nand manage metadata (table names, column names, and types, \netc.) in Hadoop ecosystem. \nAlthough Hive is a widely used project, historically its \nbiggest drawback has been performance. Most of the \nperformance problems can be attributed to Hive's use of \nMapReduce as its execution engine. MapReduce is not a good \nchoice for running ad hoc, interactive queries. The main reason \n234\n\n[Página 5]\nis that MapReduce reads and writes to disk extensively, and \nthere is a high startup cost for MapReduce jobs. \nb) Spark SQL \nApache Spark is a cluster computing platform designed to \nbe fast and general-purpose. Spark extends the popular \nMapReduce model to efficiently support more types of \ncomputations, including interactive queries and stream \nprocessing. One of the main features of Spark is to be able to \nrun computing in memory, so Spark has faster computing \nspeed than MapReduce. \nSpark SQL [8] is the component that Spark uses to \nmanipulate structured data. It allows querying data via SQL as \nwell as the HiveSQL and it supports many sources of data, \nincluding Hive tables, Parquet, and JSON. Spark SQL is fully \ncompatible with Hive. Spark SQL supports HiveSQL and Hive \nmetastore, so we can compare the performance of Hive and \nSpark SQL under the same test datasets. \nSpark SQL also seamlessly integrates with Spark machine \nlearning libraries MLlib and Spark ML. For example, in a \nmachine learning application, the DataFrame API provided by \nSpark SQL can easily be used for data cleaning and feature \nengineering. \nc) Test environment \nThe test environment is a Cloudera Data Hub (CDH) \ncluster with 4 nodes connected directly through Gigabit \nnetwork, and detail hardware and software are shown in Table \nIII. Cloudera CDH 5.10 with default configurations was used \nfor all tests. \nTABLE III. TEST ENVIRONMENT FOR TESTING HIVE VS . 
SPARK \n Node1 Node2 Node3 Node4 \nCPU Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz (8 cores) \nMemory \n(GB) 64 80 80 80 \nStorage 4TB HDD hard drive \nOperating \nSystem CentOS 6.7 x86_64 \nHadoop Cloudera Data Hub 5.10.0 (Hadoop 2.6.0) \nHive Hive 1.1.0 \nSpark Spark 2.1.0 (--driver-memory 10g –execuotr-memory 20g ) \nRoles HDFS \nNameNode, \nResourceManager HDFS DataNode \nNodeManager \n3) Generating test datasets and workload by TPCx-\nBigBench \nBigBench covers the “3Vs” characteristics of the big data \nsystem. The initial implementation of BigBench was at the \nTeradata Aster platform in 2014. Later on, BigBench was \nstandardized by TPC in Nov. 2016, and TPC released TPCx-\nBigBench v1.2.0 as the benchmark for big data analytics \nsystem. BigBench benchmark consists of the data model, the \ndata generator and the specification of the workload. \na) Data model of BigBench The data model of BigBench includes structured data, semi-\nstructured data, and unstructured data, as shown in Fig.4. The \nstructured data of BigBench is adapted from TPC-DS. The \nsemi-structured data is composed of clicks made by customers \nand guest users visiting the retailer’s website. The unstructured \ndata is covered by product reviews submitted by actual \ncustomers or guest users. Therefore, BigBench satisfies the \n“variety” property of big data. \n \nFig.4. Data model of TPCx-BigBench \nb) Data generator of BigBench \nThe data generator of BigBench is based on an extension of \nPDGF [9] and allows generating data in accordance with the \ndata model. It can not only generate the structured data but also \ngenerate the semi-structured and unstructured data. PDGF is a \nparallel data generator that is capable of generating large \namounts data based on a scale factor. So, the “volume” \nproperty of big data is reflected in BigBench. 
In addition, the \n“velocity” property of big data is implemented through a \nperiodic refreshing scheme that continually adds new data to \ndifferent tables in the data model. The following script \nfragment shows how to set data storage directory and generate \n50GB datasets parallel by BigBench. \n# Set dataset’s HDFS storage path in userSettings.conf \n1: export BIG_BENCH_HDFS_ABSOLUTE_PATH \n=\"/user/$BIG_BENCH_USER\" \n2: export BIG_BENCH_HDFS_RELATIVE_HOME \n=\"benchmarks/bigbench\" \n# Generate 50GB test datasets with TPCx-BigBench \n1: $INSTALL_DIR/bin/bigBench runBenchmark –f 50 –m 8 –i \nDATA_GENERATION \n-f \n-m [number of map tasks for data generation] \n-i \nc) Query workload of BigBench \nThe BigBench query workload includes 30 queries, which \nare defined as questions about the business model. Ten of them \nhave been taken from the TPC-DS workload. The other 20 \nqueries were adapted from a McKinsey big data use cases and \nopportunities report. The 30 queries of BigBench can be \nclassified from two aspects: data types and analysis methods, \nas shown in Table IV and Table V. Analysis methods can be \ngrouped into four categories: Pure Hive Queries(Pure HQL), \nHive Queries with MapReduce programs, Hive Queries using \nnatural language processing(NLP/UDF/UDTF), and Queries \nusing Apache Spark MLLIB(Machine Learning). \n235\n\n[Página 6]\nTABLE IV. DATA TYPES OF BIGBENCH ’S WORKLOAD \nData type Queries Number \nStructured data query1,query6, query7, query9, \nquery11, query13, query14, query15, \nquery16, query17, query20, query21, \nquery22, query23, query24, query25, \nquery26, query29 18 \nSemi-structured data query2, query3, query4, query5, \nquery8, query12, query30 7 \nUnstructured data query10, query18, query19, query27, \nquery28 5 \nTABLE V. 
ANALYTIC METHOD OF BIGBENCH ’S WORKLOAD \nAnalytic method Queries Number \nPure HQL query6, query7, query9, query11, \nquery12, query13, query14, query15, \nquery16, query17, query21, query22, \nquery 23, query 24 14 \nMapReduce query2, query3, query4, query8, \nquery30 5 \nMachine Learning query5, query20, query25, query26, \nquery28 5 \nNLP/UDF/UDTF query1, query10, query18, query19, \nquery27, query29 6 \n4) Data Loading in BigBench \nData loading in BigBench refers to load test datasets into \nHive tables. The following script fragment shows how to load \ntest datasets created in the phase of “DATA_ GENERATION” \ninto Hive tables. We can verify whether data loading was \nsuccessful or not by Hive’s shell command. \n# Load test datasets into Hive tables \n1: $INSTALL_DIR/bin/bigBench runBenchmark –i LOAD_TEST \n \n# Verify the test datasets was loaded successfully \n2: hive> use bigbench; \n3: hive> show tables; \n5) Testing for Hive vs. Spark SQL \nIn order to compare the performance of Hive and Spark \nSQL, we use Hive engine and Spark engine respectively. We \nexecute 30 queries in sequence to compare the execution time, \nas shown in Table Ď. T h e s c r i p t f r a g m e n t i s a s f o l l o w s . I t i s \nworth noting that before using Spark engine we need to ensure \nthat Spark had access to the tables in Hive. \n# Test Hive performance with BigBench \n1: $INSTALL_DIR/bin/bigBench runBenchmark –i POWER_TEST \n \n# Test Spark SQL performance with BigBench \n2: $INSTALL_DIR/bin/bigBench runBenchmark –i POWER_TEST \n–e spark_sql \nTABLE VI. EXECUTION TIME FOR ALL QUERIES WITH SF 50(50G DATA ) \nQuery No. 
Analytic method Execution time (seconds) \nHive Spark SQL \nquery1 UDF/UDTF 296 124 \nquery2 MapReduce 3904 1634 \nquery3 MapReduce 1046 568 \nquery4 MapReduce 3932 989 \nquery5 Machine Learning 535 344 \nquery6 Pure HQL 603 238 \nquery7 Pure HQL 897 260 query8 MapReduce 680 251 \nquery9 Pure HQL 1123 138 \nquery10 NLP/UDF/UDTF 1133 1868 \nquery11 Pure HQL 242 110 \nquery12 Pure HQL 271 146 \nquery13 Pure HQL 361 152 \nquery14 Pure HQL 93 92 \nquery15 Pure HQL 151 124 \nquery16 Pure HQL 823 236 \nquery17 Pure HQL 230 118 \nquery18 NLP/UDF/UDTF 1066 903 \nquery19 NLP/UDF/UDTF 401 317 \nquery20 Machine Learning 341 322 \nquery21 Pure HQL 613 175 \nquery22 Pure HQL 160 128 \nquery23 Pure HQL 254 145 \nquery24 Pure HQL 307 118 \nquery25 Machine Learning 483 350 \nquery26 Machine Learning 249 291 \nquery27 NLP/UDF/UDTF 121 201 \nquery28 Machine Learning 456 510 \nquery29 UDF/UDTF 237 154 \nquery30 UDF/UDTF/MapReduce 3769 922 \n6) Performance analysis of Hive vs. Spark SQL \nAccording to Table Ď, Fig.5 and Fig.6, Spark SQL \nperformance is 1-8 times that of Hive under 14 Pure HQL \nqueries and 5 Hive queries with MapReduce. The main reason \nis that Spark SQL uses memory computing and optimized SQL \nengine. So Spark SQL is more efficient than Hive that uses \nMapReduce as a computing engine. \n \nFig.5. Hive and Spark SQL performance comparison by Pure HQL query \n \nFig.6. Hive and Spark SQL performance comparison by MapReduce query \n236\n\n[Página 7]\nFor machine learning workload 㸪Hive and Spark SQL are \nsimilar in performance, since both Hive and Spark SQL use \nSpark MLLIB as a machine learning engine, as shown in Fig.7. \n \nFig.7. Hive and Spark SQL performance comparison by machine learning \nSince NLP programs were written in the Python language, \nneither Hive nor Spark SQL can take advantage of the system’s parallel computing features. 
As a result, for NLP/UDF/UDTF workload, Hive and Spark SQL performance’s gap is not large, and Hive outperformed Spark SQL even on query 10 and query 27, as shown in Fig.8. \n \nFig.8. Hive and Spark SQL performance comparison by NLP/UDF/UDTF \nFor query10, we modify the parameter of \nspark.sql.shuffle.partition from the default of 200 to 50 to \noptimize the performance of Spark SQL. In Spark SQL, a large number of shuffle partitions means more tasks when shuffle operation occurs. More tasks in Spark SQL will increase the overhead of tasks startup and decrease the performance of the system. As shown in Fig.9, by optimizing Spark SQL’s parameter, query 10 reduce its run time from 1868 seconds to 1376 seconds. \n \nFig.9. Spark SQL performance improvement through optimization \nV. CONCLUSION \nWith the continuous development of big data applications \nand technologies, industry and academia pay more and more attention to the benchmark testing of big data analytics systems. It not only equitably compares the performance of multiple big data analytics systems, but also allows you to tune system parameters and optimize system performance. The paper analyzes the challenges of testing big data analytics system and summarizes the methods and strategies of the test. And the paper presents two cases of benchmark testing for big data analytics systems. In case 1, we present an automated system testing solution for Transwarp Inceptor by TPC-DS in detail, and the test includes system’s functionality, performance, reliability and compatibility of SQL. In case 2, we test and compare the performance of Hive and Spark SQL by TPCx-BigBench, an application oriented end-to-end benchmark. Test results show that the performance of Spark SQL significantly better than Hive on the workload of pure HQL and query with MapReduce. 
In the future, we will further research new technologies of big data benchmarks [10], such as testing and evaluation of streaming analytics and graph analytics systems. \n \nR\nEFERENCES \n \n[1] A.Thusoo, J.S. Sarma, N. Jain, et al, “Hive-a petabyte scale data \nwarehouse using hadoop”, IEEE 26th International Conference on Data \nEngineering. IEEE, 2010, pp.996-1005. \n[2] M. Zaharia, M. Chowdhury, M. J. Franklin, S. Shenker, et al, “Spark: \nCluster Computing with Working Sets”, Usenix Conference on Hot \nTopics in Cloud Computing, Boston, USA, 2010. \n[3] S. Huang, J. Huang, J. Dai, et al, “The HiBench Benchmark Suite: \nCharacterization of the MapReduce-Based Data Analysis”. ICDE \nWorkshops, 2010, pp. 41 - 51. \n[4] R. O. Nambiar, M. Poess, “The making of TPC-DS”, Proceedings of the \n32nd international conference on Very large data bases. VLDB \nEndowment, 2006, pp.1049-1058. \n[5] M. Poess, R. O. Nambiar, D. Walrath,“Why you should run TPC-DS: a \nworkload analysis”, Proceedings of the 33rd international conference on \nVery large data bases. VLDB Endowment, 2007, pp.1138-1149. \n[6] A. Ghazal, T. Rabl, M. Hu, et al, “BigBench: towards an industry \nstandard benchmark for big data analytics”, Proceedings of the 2013 ACM SIGMOD international conference on Management of data, 2013, \npp.1197-1208. \n237\n\n[Página 8]\n[7] TPCx-BigBench Standard Specification Version 1.2.0, November 2016, \nhttp://www.tpc.org/ \n[8] M. Armbrust, R. S. Xin, C. Lian, et al, “Spark sql: Relational data \nprocessing in spark”, Proceedings of the 2015 ACM SIGMOD \nInternational Conference on Management of Data, 2015, pp.1383-1394. \n[9] T. Rabl, M. Frank, H. M. Sergieh, et al, “A Data Generator for Cloud-\nScale Benchmarking”, TPCTC, 2010, pp.41-56. [10] T. Rabl, M. Frank, M. Danisch, et al, “The vision of BigBench 2.0”, \nProceedings of the Fourth Workshop on Data analytics in the Cloud., \nACM, 2015. \n \n \n \n238",
+ "2b2f0d65-1bc3-407f-b86d-119120dfb357": {
+ "content": "Testing of big data analytics systems by benchmark \n \nMingang Chen \nShanghai Key Laboratory of Computer Software Testing \nand Evaluating \nShanghai Development Center of Computer Software \nTechnology \nShanghai, China \ncmg@ssc.stn.sh.cn Wenjie Chen, Lizhi Cai \nShanghai Key Laboratory of Computer Software Testing \nand Evaluating \nShanghai Development Center of Computer Software \nTechnology \nShanghai, China \ncwj@ssc.stn.sh.cn, clz@ssc.stn.sh.cn \n \n \nAbstract —With the rapid development of big data \ntechnologies and applications, various big data analytics systems \nhave been released by open source communities and industry. So \ntesting and evaluating the overall performance of these big data \nanalytics systems has become an important research topic. The \npaper analyzes in detail the challenges of testing big data \nanalytics systems and proposes the method and strategies for the \ntesting. Furthermore, the paper presents two cases of testing big \ndata analytics systems by benchmark. \nKeywords—testing; big data; benchmark; TPC-DS; TPCx-\nBigBench. \nI. INTRODUCTION \nIn recent years, big data has become a hot topic for \ngovernments and enterprises, and it is considered as a new \ndriving force for innovation in the information era. This is \nbased on the following two facts: firstly, in the past ten years, \nthe speed of data generating is becoming faster and faster, and \nwe have already entered the big data era; secondly, big data \ncontains huge values, and has brought about revolutionary \ndevelopments in many fields, such as e-commerce, finance, \ntransportation, medical and health service, etc. \nHowever, the “3V” characteristics (volume, variety, and \nvelocity) of big data make challenges for data processing and \nanalytics. 
Recently, industry and academia have launched a \nvariety of big data analytics system to cope with the challenges, \nsuch as open source Apache Hive [1], Apache Spark [2], and \ncommercial Transwarp Inceptor, Cloudera Impala, IBM Big \nSQL and so on. More and more enterprises or organizations \nuse big data analytics system to build the business application \nand obtain decision support from data. Therefore, testing and \nevaluating big data analytics systems has become one of the \nimportant research subjects of the big data fields. \nTesting of big data analytics system mainly has the \nfollowing three roles. (1) We can verify the correctness of \nfunctionalities and the reliability of the big data analytics \nsystem before it is deployed and put to use. (2) We can carry \nout a fair comparison of the performance of different big data \nanalytics systems. (3) We can optimize the performance of big \ndata analytics systems by testing. \nPresently, testing of big data analytics system mainly uses \nbenchmarks, and by benchmark testing, we can analyze and evaluate the functionalities, performance, reliability, and \ncompatibility of the system. There are three categories of \nbenchmark in the testing of big data analytics systems. The first \ncategory is the micro benchmark. This category of benchmark \nprincipally aims at testing a certain component of the big data \nanalytics system thus is also called component-level \nbenchmark. Such as TeraSort can only be used to test the \nsystem’s performance for sorting text data, and GridMax can \nonly be used to test the performance of various MapReduce job \nin the Hadoop clusters. Therefore, the micro benchmark cannot \nevaluate the performances of big data analytics system entirely. \nThe second category is the comprehensive benchmark. This \ncategory of the benchmark can test more than one components \nof big data analytics system. 
For example, Hibench is a \ncomprehensive benchmark, and its workload including micro \nbenchmarks, web search, SQL query and machine learning [3]. \nThe third category is the application oriented benchmark, \nwhich is characterized by simulating the scenario of big data \napplications in the enterprise. TPC-DS is a benchmark for \ntesting big data decision support systems [4, 5]. TPCx-\nBigBench [6, 7] is the first end-to-end, application-level big \ndata benchmark based on TPC-DS. Due to the standardization \nand usability of TPC-DS and TPCx-BigBench, more and more \norganizations begin to use these two benchmarks to test, \nevaluate and compare the overall performance of big data \nanalytics systems. \nThis paper will discuss in detail the challenges of testing big \ndata analytics systems in Part II, and propose method and \nstrategies of how to test big data analytics systems in Part III. \nIn Part IV, two cases of testing will be presented, that is testing \nof Transwarp Inceptor by TPC-DS and performance \ncomparison of Hive and Spark SQL by TPCx-BigBench. In \naddition, some preliminary analysis will be made on how to \noptimize the performance of Spark SQL by benchmark testing. \nFinally, we conclude the paper in section V. \nII. THE CHALLENGES OF TESTING BIG DATA ANALYTICS SYSTEM \nDue to the “3V” characteristics of big data and the \ncomplexity of big data analytics system, this brings about \nchallenges for testing big data analytics system. \nFirst is the complexity of the technologies on big data \nanalytics system. It generally adopts distributed architectures, \nsuch as master-slave or peer-to-peer. And factors that will \nThis work was funded by Science and Technology Commission of \nShanghai Municipality Program (16511101202, 17411952800). 
\n2312018 IEEE International Conference on Software Testing, Verification and Validation Workshops\n0-7695-6432-1/18/$31.00 ©2018 IEEE\nDOI 10.1109/ICSTW.2018.00054\n\naffect the performance of the system under test are complex, \nsuch as network environment, hardware configurations, system \nconfiguration parameters, and virtualization etc. For instance, \nHadoop system has over 200 configuration parameters. \nSecond is the complexity of test datasets. The test datasets \nof big data analytics system need not only to meet the “3V” \ncharacteristics of big data but also to represent typical business \nscenes. \nThird are the challenges of testing methods and tools, such \nas the traditional testing tools can no longer be appropriate, \nlacking automatic testing methods and the customization of \ntesting and diagnosing schemes. Different modules in the big \ndata analytics require different testing techniques. For example, \nwe test the performance of Spark SQL by SQL’s queries, while \nwe test throughput and latency of Spark Streaming by loading \nstreaming data. \nFourth, the testing of big data analytics system requires \nmore professional and more comprehensive testing abilities. \nTesters not only need to have the testing expertise but also need \nto master the big data analysis and processing technology. For \nexample, testers need to know how to load data from Hadoop \nHDFS into a Hive table and verify if the loading is correct. \nIII. BENCHMARK TESTING METHOD OF BIG DATA ANALYTICS \nSYSTEM \nThe testing of big data analytics system with benchmark \ncan generally be divided into 6 phases, that is requirement \nanalysis for testing big data analytics systems, preparing the \ntesting environment, preparing the test datasets and workload, \nloading the test datasets, testing for the big data analytics \nsystem and analysis of the testing result, as shown in Fig.1. \n \nFig.1. Benchmark testing method of big data analytics system \nA. 
Requirement analysis of testing for big data analytics \nsystems \nThe phase of requirement analysis for testing big data \nanalytics systems is by and large same as traditional software \ntesting, including specifying the objects of testing, the purposes \nof testing, the environment of testing, the datasets of testing, \ntechnology and tools of testing and the risk of testing, etc. But the key point of testing big data analytics system is the \nperformance and reliability of the system. For example, how \nefficient is the system's processing and analysis of data with \nlarge-scale datasets? Whether tasks of data processing can be \nmigrated automatically or not when a node in the cluster goes \ndown? Will the data be lost in a distributed environment when \na node crashes? \nB. Preparing the testing environment \nIn order to test a big data analytics system, we need to \nprepare a cluster of distributed data storage and computing, at \nthe same time a sufficient storage space is required to store and \nanalyze the large-scale datasets. It is worth noticing that the \nstorage space here not only refers to the hard disk space but \nalso memory space, especially in testing Apache Spark, due to \nthe 60% occupation of memory is used for buffering RDD (the \ndata structure of Spark), so enough memory space should be set \napart for the testing program. We should be careful that, the \ntesting environment should be ensured “clean”. In other words, \nwe should ensure that there is no other applications running in \nthe cluster, the CPU and memory of the node in the cluster are \nboth at their minimum utilization. \nC. Preparing the test datasets and workload \nThe datasets for testing big data analytics system comes \nfrom two sources: one is the real data from business, such as \ndata from weblogs or database of business; the other is \nsimulated data generated by big data benchmarking tools. 
TPC-\nDS and TPCx-BigBench are two benchmarks that have been \nnominated in the industry. It should be noted that we should set \nappropriate data scale, data type, and data model according to \nthe requirement of the testing. The workload is the core of \nperformance testing of big data analytics system. It needs to \nreflect business scenarios and data analytical techniques. The \nworkload in TPC-DS or TPCx-BigBench is the set of queries to \nbe executed against the test datasets. \nD. Loading the test datasets \nDuring the phase of loading test datasets, we should verify \nif the data has been loaded correctly into the distributed storage \nsystem. For example, whether the data is loaded into the right \nHDFS storage directory? Is the size of data file correct? If the \ndata need to be loaded into the distributed database system, we \nshould verify if the data can be load into the table in the \ndatabase correctly. \nE. Testing of the big data analytics system \nTesting of the big data analytics system needs to focus on \nsystem’s functionality, performance, reliability, and \ncompatibility. \n1) Functionality testing \nThe Functionality testing of big data analytics system mainly \nverifies whether functions of the system in data storage, data \nprocessing, data I/O etc. are correct? For example, whether \ndata processing based on MapReduce is correct? Whether the \nresults of SQL queries on the SQL-On-Hadoop system are \ncorrect? And whether the data I/O is complete? \n232\n2) Performance testing \nThe performance testing of big data analytics system needs \nto test the performance of data I/O, data processing and \nanalytic and the performance of SQL query on the system and \nso on. For example, we can test the reading and writing \nperformance of Hadoop HDFS using single large data file or \nmultiple large data files. For SQL-On-Hadoop systems, the \nperformance of SQL query is the most important performance \nmetric. 
\n3) Reliability testing \nThe reliability testing of big data analytics system needs to \nfocus on the following two aspects: \n• If the task can be automatically migrated when a task \nof data analytics failed at a certain node (may be due to lack of \nmemory), so as to ensure the task is executed correctly? \n• If one or some nodes in the cluster go down, will the \ntask of the data analytic be executed correctly due to the fault-\ntolerant mechanism of the system? \n4) Compatibility Testing \nThe compatibility testing of big data analytics system needs \nto verify the compatibility of the file system, the compatibility \nof data storage format, the compatibility of SQL syntax and so \non. \nF. Analysis of the testing result \nDuring the phase of analysis of the testing result, we need \nto analyze system’s testing metrics (functionality, performance, \nreliability, and compatibility) comprehensively according to the \ntesting requirement and finish the testing report. \nIV. CASES OF TESTING BIG DATA ANALYTICS \nAccording to the test method described in Part ċ, in this \nsection, we present two cases of testing big data analytics \nsystem. \nA. Testing for Transwarp Inceptor by TPC-DS \n1) Requirement analysis of testing Transwarp Inceptor \nThe purpose of testing Transwarp Inceptor is to verify the \nfunctionality of ETL ˈand evaluate the performance of SQL \nquery and compatibility of SQL syntax through automated \ntesting scripts. The method of testing follows the TPC-DS \nspecification. \n2) The system under test and environment \na) Transwarp Inceptor big data analytics system \nInceptor is a commercial big data analytics system \ndeveloped by Transwarp Technology Co., Ltd. It provides \nhigh-speed SQL analytics based Apache Spark. It can help \nbusinesses to build high-speed, scalable data warehouses, and \nperform interactive analysis, real-time reporting, and \nvisualization of data. 
Transwarp Inceptor has a three-tier \nstructure from bottom to top: the storage layer, the distributed computing engine layer and the interface layer, as is shown in \nFig.2. \n \nFig.2. Architecture of Transwarp Inceptor \nb) Test environment \nThe test environment consists of four physical servers, and \nthe configurations of servers are same, as is shown in Table I. \nFour servers make up a Transwarp cluster through Gigabit \nnetwork. \nTABLE I. THE HARDWARE CONFIGURATION OF THE TESTING SERVERS \n Node1 Node2 Node3 Node4 \nModel Dell PowerEdge R720 \nCPU Intel(R) Xeon(R) CPU E5-2620 v2 @ 2.10GHz \n ( 2 CPU x 6 cores) \nMemory \n(GB) 256 256 256 256 \nStorage 24 TB HDD hard drive \nOperating \nSystem Red Hat Enterprise Linux 6.5 \nHadoop Transwarp DataHub v3.4 Hadoop 2.2 \nInceptor Transwarp Inceptor v4.0 \nRoles Primary \nNameNode, \nInceptor Server, \nDataNode Secondary \nNameNode, \nInceptor \nMetaStore, \nDataNode \nDataNode \nDataNode \n3) Generating test datasets and workload by TPC-DS \na) TPC-DS \nTPC-DS is testing benchmark for decision support system \nproposed by TPC (Transaction Processing Performance \nCouncil). TPC-DS models the decision support functions of a \nretail product supplier. The business model of benchmark \nsimulates sales and returns of the three main channels (stores, \nonline retailers, and catalogs). The business model contains 7 \nfact tables and 17 dimension tables, and tables are organized by \nstar and snowflake mixed model. A reduced business model of \nTPC-DS is shown in Fig.3. \n233\n \nFig.3. TPC-DS database schema \nTPC-DS allows users to generate the different scale of \ndatasets from 100G to 100T according to the user’s test \nrequirements and test environment. In general, the TPC-DS \nbenchmark has following characteristics: \n• A large amount of business data and test cases (SQL \nqueries) can answer real business problems. 
\n• A total of 99 SQL queries follow the SQL 99 and SQL \n2003 core syntax standard, and SQL queries are \ncomplex. \n• The test cases include a variety of business models, \nsuch as interactive query, statistical analysis, iterative \nOLAP and data mining. \n• Almost all of the test cases need high I/O loading and \nCPU computing. \nb) The generation of test datasets and workload \nIn this phase, we use the data generation and query \ngeneration tools (DSTools v1.3.0) provided by the TPC-DS \nbenchmark to generate 500GB test datasets and 99 SQL \nqueries through automated shell scripts, and the script fragment \nis as follows. \n# Generate 500GB test datasets in the specified HDFS directory \n1: dbgen2 -scale 500 -dir HDFS_LOCATION \n \n# Generate 99 queries compatible with Oracle syntax for 500GB \ndatasets through the query template \n2: qgen2 –query99.tpl –directory QUERY_TEMPLATE –dialect \noracle -scale 500 \nThe 500GB test datasets consist of 24 tables of the database \n(7 fact tables and 17 dimension tables) mentioned above. The \n99 SQL queries implement business intelligence by answering \nreal business questions. \n4) Data loading \nIn the data loading phase, we first create 24 tables in \nTranswarp Inceptor to build the data warehouse for testing. The \nschemas of tables are provided by the TPC-DS benchmark. \nThen we load the datasets that have been generated in the \nHDFS into tables. The following script fragment shows how to \nload datasets in HDFS into the inventory table. \n# load inventory.dat into the inventory table \n1: LOAD DATA inpath '/tpc_ds/data/inventory.dat' INTO TABLE \ninventory; \n5) Testing for Transwarp Inceptor \nThe core of the TPC-DS based benchmark testing is the \nexecution of 99 SQLs one by one. In testing, we verify the \ncorrectness of the test results and record the execution time of \nSQL. We execute 99 SQLs with automated scripts by three rounds and take the average time of three rounds as SQL’s \nexecution time. 
The following script fragment shows how to \nexecute 99 SQL queries sequentially in Transwarp Inceptor. \n# Execute all 99 SQL queries one by one \n1: for(i = 1; i<=99; i++ ){ \n2: sql = \"query\"+ i + \".sql\"; \n3: system( \"transwarp -t -h localhost -f ./sql/\" + sql); \n4:}\n6) Testing Analysis \nIn the case of the 500GB test datasets, the four categories \nof SQL execution time are shown in Table II. Test results \nshow that 96 out of 99 SQL queries can be run directly in \nTranswarp Inceptor. There only 3 SQL queries need minor \nmodification to be compatible with SQL compiler of \nTranswarp Inceptor. Considering that the TPC-DS \nspecification allows SQL’s minor modification, so Transwarp \nInceptor has good compatibility with SQL 2003 standard. \nTABLE II. SQL QUERIES ’ EXECUTION TIME OF TRANSWARP INCEPTOR \nSQL \nCategories The number \nof SQL The total \nexecution time \n(seconds) The average \nexecution time \n(seconds) \nInteractive \nquery 9 197 21.9 \nStatistical \nanalysis 69 7705 111.7 \nIterative OLAP 10 4232 423.2 \nData mining 11 3502 318.4 \nB. Testing Hive vs. Spark SQL by TPCx-BigBench \n1) Requirement analysis of te sting Hive vs. Spark SQL \nTesting Hive vs. Spark SQL has two purposes. One is to \nutilize TPCx-BigBench as a benchmark for evaluating and \ncomparing the performance of two SQL-On-Hadoop analytics \nsystems. The other is to tune system parameters for optimizing \nanalytics system’s performance. \n2) Systems under test and test environment \na) Hive \nHive is one of the first data analytics engines to be built on \ntop of MapReduce. It was originally developed by Facebook to \nsupport data analysts to analyze large datasets in Hadoop by \nqueries in a SQL-like declarative query language. This SQL-\nlike language is called HiveQL and is based on the SQL \nlanguage, but does not strictly follow the SQL 99 standard. \nHive has now become the foundation of new SQL on Hadoop \nprojects, such as Impala, Presto, and Spark SQL. 
Hive \nmetadata has become the de facto standard for users to store \nand manage metadata (table names, column names, and types, \netc.) in Hadoop ecosystem. \nAlthough Hive is a widely used project, historically its \nbiggest drawback has been performance. Most of the \nperformance problems can be attributed to Hive's use of \nMapReduce as its execution engine. MapReduce is not a good \nchoice for running ad hoc, interactive queries. The main reason \n234\nis that MapReduce reads and writes to disk extensively, and \nthere is a high startup cost for MapReduce jobs. \nb) Spark SQL \nApache Spark is a cluster computing platform designed to \nbe fast and general-purpose. Spark extends the popular \nMapReduce model to efficiently support more types of \ncomputations, including interactive queries and stream \nprocessing. One of the main features of Spark is to be able to \nrun computing in memory, so Spark has faster computing \nspeed than MapReduce. \nSpark SQL [8] is the component that Spark uses to \nmanipulate structured data. It allows querying data via SQL as \nwell as the HiveSQL and it supports many sources of data, \nincluding Hive tables, Parquet, and JSON. Spark SQL is fully \ncompatible with Hive. Spark SQL supports HiveSQL and Hive \nmetastore, so we can compare the performance of Hive and \nSpark SQL under the same test datasets. \nSpark SQL also seamlessly integrates with Spark machine \nlearning libraries MLlib and Spark ML. For example, in a \nmachine learning application, the DataFrame API provided by \nSpark SQL can easily be used for data cleaning and feature \nengineering. \nc) Test environment \nThe test environment is a Cloudera Data Hub (CDH) \ncluster with 4 nodes connected directly through Gigabit \nnetwork, and detail hardware and software are shown in Table \nIII. Cloudera CDH 5.10 with default configurations was used \nfor all tests. \nTABLE III. TEST ENVIRONMENT FOR TESTING HIVE VS . 
SPARK \n Node1 Node2 Node3 Node4 \nCPU Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz (8 cores) \nMemory \n(GB) 64 80 80 80 \nStorage 4TB HDD hard drive \nOperating \nSystem CentOS 6.7 x86_64 \nHadoop Cloudera Data Hub 5.10.0 (Hadoop 2.6.0) \nHive Hive 1.1.0 \nSpark Spark 2.1.0 (--driver-memory 10g –execuotr-memory 20g ) \nRoles HDFS \nNameNode, \nResourceManager HDFS DataNode \nNodeManager \n3) Generating test datasets and workload by TPCx-\nBigBench \nBigBench covers the “3Vs” characteristics of the big data \nsystem. The initial implementation of BigBench was at the \nTeradata Aster platform in 2014. Later on, BigBench was \nstandardized by TPC in Nov. 2016, and TPC released TPCx-\nBigBench v1.2.0 as the benchmark for big data analytics \nsystem. BigBench benchmark consists of the data model, the \ndata generator and the specification of the workload. \na) Data model of BigBench The data model of BigBench includes structured data, semi-\nstructured data, and unstructured data, as shown in Fig.4. The \nstructured data of BigBench is adapted from TPC-DS. The \nsemi-structured data is composed of clicks made by customers \nand guest users visiting the retailer’s website. The unstructured \ndata is covered by product reviews submitted by actual \ncustomers or guest users. Therefore, BigBench satisfies the \n“variety” property of big data. \n \nFig.4. Data model of TPCx-BigBench \nb) Data generator of BigBench \nThe data generator of BigBench is based on an extension of \nPDGF [9] and allows generating data in accordance with the \ndata model. It can not only generate the structured data but also \ngenerate the semi-structured and unstructured data. PDGF is a \nparallel data generator that is capable of generating large \namounts data based on a scale factor. So, the “volume” \nproperty of big data is reflected in BigBench. 
In addition, the \n“velocity” property of big data is implemented through a \nperiodic refreshing scheme that continually adds new data to \ndifferent tables in the data model. The following script \nfragment shows how to set data storage directory and generate \n50GB datasets parallel by BigBench. \n# Set dataset’s HDFS storage path in userSettings.conf \n1: export BIG_BENCH_HDFS_ABSOLUTE_PATH \n=\"/user/$BIG_BENCH_USER\" \n2: export BIG_BENCH_HDFS_RELATIVE_HOME \n=\"benchmarks/bigbench\" \n# Generate 50GB test datasets with TPCx-BigBench \n1: $INSTALL_DIR/bin/bigBench runBenchmark –f 50 –m 8 –i \nDATA_GENERATION \n-f \n-m [number of map tasks for data generation] \n-i \nc) Query workload of BigBench \nThe BigBench query workload includes 30 queries, which \nare defined as questions about the business model. Ten of them \nhave been taken from the TPC-DS workload. The other 20 \nqueries were adapted from a McKinsey big data use cases and \nopportunities report. The 30 queries of BigBench can be \nclassified from two aspects: data types and analysis methods, \nas shown in Table IV and Table V. Analysis methods can be \ngrouped into four categories: Pure Hive Queries(Pure HQL), \nHive Queries with MapReduce programs, Hive Queries using \nnatural language processing(NLP/UDF/UDTF), and Queries \nusing Apache Spark MLLIB(Machine Learning). \n235\nTABLE IV. DATA TYPES OF BIGBENCH ’S WORKLOAD \nData type Queries Number \nStructured data query1,query6, query7, query9, \nquery11, query13, query14, query15, \nquery16, query17, query20, query21, \nquery22, query23, query24, query25, \nquery26, query29 18 \nSemi-structured data query2, query3, query4, query5, \nquery8, query12, query30 7 \nUnstructured data query10, query18, query19, query27, \nquery28 5 \nTABLE V. 
ANALYTIC METHOD OF BIGBENCH ’S WORKLOAD \nAnalytic method Queries Number \nPure HQL query6, query7, query9, query11, \nquery12, query13, query14, query15, \nquery16, query17, query21, query22, \nquery 23, query 24 14 \nMapReduce query2, query3, query4, query8, \nquery30 5 \nMachine Learning query5, query20, query25, query26, \nquery28 5 \nNLP/UDF/UDTF query1, query10, query18, query19, \nquery27, query29 6 \n4) Data Loading in BigBench \nData loading in BigBench refers to load test datasets into \nHive tables. The following script fragment shows how to load \ntest datasets created in the phase of “DATA_ GENERATION” \ninto Hive tables. We can verify whether data loading was \nsuccessful or not by Hive’s shell command. \n# Load test datasets into Hive tables \n1: $INSTALL_DIR/bin/bigBench runBenchmark –i LOAD_TEST \n \n# Verify the test datasets was loaded successfully \n2: hive> use bigbench; \n3: hive> show tables; \n5) Testing for Hive vs. Spark SQL \nIn order to compare the performance of Hive and Spark \nSQL, we use Hive engine and Spark engine respectively. We \nexecute 30 queries in sequence to compare the execution time, \nas shown in Table Ď. T h e s c r i p t f r a g m e n t i s a s f o l l o w s . I t i s \nworth noting that before using Spark engine we need to ensure \nthat Spark had access to the tables in Hive. \n# Test Hive performance with BigBench \n1: $INSTALL_DIR/bin/bigBench runBenchmark –i POWER_TEST \n \n# Test Spark SQL performance with BigBench \n2: $INSTALL_DIR/bin/bigBench runBenchmark –i POWER_TEST \n–e spark_sql \nTABLE VI. EXECUTION TIME FOR ALL QUERIES WITH SF 50(50G DATA ) \nQuery No. 
Analytic method Execution time (seconds) \nHive Spark SQL \nquery1 UDF/UDTF 296 124 \nquery2 MapReduce 3904 1634 \nquery3 MapReduce 1046 568 \nquery4 MapReduce 3932 989 \nquery5 Machine Learning 535 344 \nquery6 Pure HQL 603 238 \nquery7 Pure HQL 897 260 query8 MapReduce 680 251 \nquery9 Pure HQL 1123 138 \nquery10 NLP/UDF/UDTF 1133 1868 \nquery11 Pure HQL 242 110 \nquery12 Pure HQL 271 146 \nquery13 Pure HQL 361 152 \nquery14 Pure HQL 93 92 \nquery15 Pure HQL 151 124 \nquery16 Pure HQL 823 236 \nquery17 Pure HQL 230 118 \nquery18 NLP/UDF/UDTF 1066 903 \nquery19 NLP/UDF/UDTF 401 317 \nquery20 Machine Learning 341 322 \nquery21 Pure HQL 613 175 \nquery22 Pure HQL 160 128 \nquery23 Pure HQL 254 145 \nquery24 Pure HQL 307 118 \nquery25 Machine Learning 483 350 \nquery26 Machine Learning 249 291 \nquery27 NLP/UDF/UDTF 121 201 \nquery28 Machine Learning 456 510 \nquery29 UDF/UDTF 237 154 \nquery30 UDF/UDTF/MapReduce 3769 922 \n6) Performance analysis of Hive vs. Spark SQL \nAccording to Table Ď, Fig.5 and Fig.6, Spark SQL \nperformance is 1-8 times that of Hive under 14 Pure HQL \nqueries and 5 Hive queries with MapReduce. The main reason \nis that Spark SQL uses memory computing and optimized SQL \nengine. So Spark SQL is more efficient than Hive that uses \nMapReduce as a computing engine. \n \nFig.5. Hive and Spark SQL performance comparison by Pure HQL query \n \nFig.6. Hive and Spark SQL performance comparison by MapReduce query \n236\nFor machine learning workload 㸪Hive and Spark SQL are \nsimilar in performance, since both Hive and Spark SQL use \nSpark MLLIB as a machine learning engine, as shown in Fig.7. \n \nFig.7. Hive and Spark SQL performance comparison by machine learning \nSince NLP programs were written in the Python language, \nneither Hive nor Spark SQL can take advantage of the system’s parallel computing features. 
As a result, for NLP/UDF/UDTF workload, Hive and Spark SQL performance’s gap is not large, and Hive outperformed Spark SQL even on query 10 and query 27, as shown in Fig.8. \n \nFig.8. Hive and Spark SQL performance comparison by NLP/UDF/UDTF \nFor query10, we modify the parameter of \nspark.sql.shuffle.partition from the default of 200 to 50 to \noptimize the performance of Spark SQL. In Spark SQL, a large number of shuffle partitions means more tasks when shuffle operation occurs. More tasks in Spark SQL will increase the overhead of tasks startup and decrease the performance of the system. As shown in Fig.9, by optimizing Spark SQL’s parameter, query 10 reduce its run time from 1868 seconds to 1376 seconds. \n \nFig.9. Spark SQL performance improvement through optimization \nV. CONCLUSION \nWith the continuous development of big data applications \nand technologies, industry and academia pay more and more attention to the benchmark testing of big data analytics systems. It not only equitably compares the performance of multiple big data analytics systems, but also allows you to tune system parameters and optimize system performance. The paper analyzes the challenges of testing big data analytics system and summarizes the methods and strategies of the test. And the paper presents two cases of benchmark testing for big data analytics systems. In case 1, we present an automated system testing solution for Transwarp Inceptor by TPC-DS in detail, and the test includes system’s functionality, performance, reliability and compatibility of SQL. In case 2, we test and compare the performance of Hive and Spark SQL by TPCx-BigBench, an application oriented end-to-end benchmark. Test results show that the performance of Spark SQL significantly better than Hive on the workload of pure HQL and query with MapReduce. 
In the future, we will further research new technologies of big data benchmarks [10], such as testing and evaluation of streaming analytics and graph analytics systems. \n \nR\nEFERENCES \n \n[1] A.Thusoo, J.S. Sarma, N. Jain, et al, “Hive-a petabyte scale data \nwarehouse using hadoop”, IEEE 26th International Conference on Data \nEngineering. IEEE, 2010, pp.996-1005. \n[2] M. Zaharia, M. Chowdhury, M. J. Franklin, S. Shenker, et al, “Spark: \nCluster Computing with Working Sets”, Usenix Conference on Hot \nTopics in Cloud Computing, Boston, USA, 2010. \n[3] S. Huang, J. Huang, J. Dai, et al, “The HiBench Benchmark Suite: \nCharacterization of the MapReduce-Based Data Analysis”. ICDE \nWorkshops, 2010, pp. 41 - 51. \n[4] R. O. Nambiar, M. Poess, “The making of TPC-DS”, Proceedings of the \n32nd international conference on Very large data bases. VLDB \nEndowment, 2006, pp.1049-1058. \n[5] M. Poess, R. O. Nambiar, D. Walrath,“Why you should run TPC-DS: a \nworkload analysis”, Proceedings of the 33rd international conference on \nVery large data bases. VLDB Endowment, 2007, pp.1138-1149. \n[6] A. Ghazal, T. Rabl, M. Hu, et al, “BigBench: towards an industry \nstandard benchmark for big data analytics”, Proceedings of the 2013 ACM SIGMOD international conference on Management of data, 2013, \npp.1197-1208. \n237\n[7] TPCx-BigBench Standard Specification Version 1.2.0, November 2016, \nhttp://www.tpc.org/ \n[8] M. Armbrust, R. S. Xin, C. Lian, et al, “Spark sql: Relational data \nprocessing in spark”, Proceedings of the 2015 ACM SIGMOD \nInternational Conference on Management of Data, 2015, pp.1383-1394. \n[9] T. Rabl, M. Frank, H. M. Sergieh, et al, “A Data Generator for Cloud-\nScale Benchmarking”, TPCTC, 2010, pp.41-56. [10] T. Rabl, M. Frank, M. Danisch, et al, “The vision of BigBench 2.0”, \nProceedings of the Fourth Workshop on Data analytics in the Cloud., \nACM, 2015. \n \n \n \n238",
"metadata": {
"filename": "chen2018.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\chen2018.pdf",
- "file_size": 325155,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:36.809043",
- "content_length": 31571
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\chen2018.pdf",
+ "size": 325155,
+ "source": "docs_to_import"
+ },
+ "id": "2b2f0d65-1bc3-407f-b86d-119120dfb357"
},
- "94553f7c-0219-4683-8566-938f0d311229": {
- "id": "94553f7c-0219-4683-8566-938f0d311229",
- "content": "[Página 1]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n1\nAutoDiagn: An Automated Real-time Diagnosis\nFramework for Big Data Systems\nUmit Demirbaga, Zhenyu Wen\u0003Member, IEEE , Ayman Noor, Karan Mitra, Member, IEEE , Khaled\nAlwasel, Saurabh Garg, Albert Zomaya, Fellow, IEEE , Rajiv Ranjan, Senior Member, IEEE\nAbstract—Big data processing systems, such as Hadoop and Spark, usually work in large-scale, highly-concurrent, and multi-tenant\nenvironments that can easily cause hardware and software malfunctions or failures, thereby leading to performance degradation.\nSeveral systems and methods exist to detect big data processing systems’ performance degradation, perform root-cause analysis, and\neven overcome the issues causing such degradation. However, these solutions focus on specific problems such as stragglers and\ninefficient resource utilization. There is a lack of a generic and extensible framework to support the real-time diagnosis of big data\nsystems. In this paper, we propose, develop and validate AutoDiagn. This generic and flexible framework provides holistic monitoring of\na big data system while detecting performance degradation and enabling root-cause analysis. We present an implementation and\nevaluation of AutoDiagn that interacts with a Hadoop cluster deployed on a public cloud and tested with real-world benchmark\napplications. 
Experimental results show that AutoDiagn can offer a high accuracy root-cause analysis framework, at the same time as\noffering a small resource footprint, high throughput and low latency.\nIndex Terms—Root-cause analysis, Big data systems, QoS, Hadoop, Performance\nF\n1 I NTRODUCTION\nThe rapid surge of data generated through sectors like\nsocial media, financial services and industries has led to\nthe emergence of big data systems. Big data systems enable\nthe processing of massive amounts of data in relatively\nshort time frames. For instance, the Netflix big data pipeline\nprocesses approximately 500 billion events and 1.3 petabytes\n(PB) of data per day, further, during peak hours, it processes\napproximately 11 million events and 24 gigabytes (GB) of\ndata on a per-second basis. Facebook has one of the largest\ndata warehouses in the world, capable of executing more\nthan 30,000 queries over 300 PB data every day. However,\nthe enormousness and complexity of the big data system\nruns in heterogeneous computing resources, multiple tenant\nenvironments, as well as has many concurrent execution of\nbig data processing tasks, which makes it a challenge to\nutilize the big data systems efficiently and reliably[1]. For\nexample, Fig. 1 shows that the performance degrades at\nleast 10% when the resources are not utilized efficiently with\nSetting 2.\n\u000fU. Demirbaga is with Newcastle University, United Kingdom and Bartin\nUniversity, Turkey. E-mail: u.demirbaga2@newcastle.ac.uk\n\u000fZ. Wen is with Newcastle University, United Kingdom. E-mail:\nzhenyu.wen@newcastle.ac.uk, corresponding author.\n\u000fA. Noor is with Newcastle University, United Kingdom and Taibah\nUniversity, Saudi Arabia. E-mail: anoor@taibahu.edu.sa\n\u000fK. Mitra is with Lule˚ a University of Technology, Sweden. E-mail:\nkaran.mitra@ltu.se\n\u000fK. Alwasel is with Newcastle University, United Kingdom and Saudi\nElectronic University, Saudi Arabia. E-mail: kalwasel@gmail.com\n\u000fS. 
Garg is with University of Tasmania, Australia. E-\nmail:Saurabh.Garg@utas.edu.au\n\u000fA. Zomaya is with Sydney University, Australia, E-mail: al-\nbert.zomaya@sydney.edu.au\n\u000fR. Ranjan is with Newcastle University, United Kingdom. E-mail:\nraj.ranjan@newcastle.ac.uk\n 0 50 100 150 200 250 300 350\nWordCountGrepTPC-HTPC-DS K-means PageRankMakespan (sec)\nBig data applicationsSetting 1 Setting 2Fig. 1. Six big data applications are executed in a cloud-based Hadoop\ncluster with two settings: 1) the input data and jobs are allocated in\nthe same node; 2) the input data and jobs are allocated in different\nnodes. In Setting 2, the execution time of each application is delayed\nby transmitting data across nodes.\nTo overcome this, it is imperative to continuously mon-\nitor and analyze all available system resources at all times\nin a systematic, holistic and automated manner. These re-\nsources include CPU, memory, network, I/O and the big\ndata processing software components.\nMost of the commercial [2][3][4] and academic big\ndata monitoring systems mainly focus on visualizing task\nprogress, and the system’s resource utilization [5]. How-\never, they do not focus on the interaction between multiple\nfactors and performing root-cause analysis for performance\ndegradation [6][7]. Moreover, works such as [8], [9] aim to\nfind the best parameters to optimize the performance of\nManuscript received ???; revised ???\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 2]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n2\nbig data processing systems, they do not focus on the root-\ncause analysis that may indicate the viable reasons behind\nperformance degradation and may provide intuitions for\nparameter tweaking.\nMantri [10] presents a systematic method that catego-\nrizes the main reasons causing outliers in a big data system.\nThe authors’ work was focused on the MapReduce pro-\ngramming framework in the Hadoop system; they do not\ndiscuss how Mantri can be applied to other big processing\nframeworks (e.g., Apache Spark1, and Apache Flink2). Gar-\nraghan et al. [11] proposed an online solution to detect long-\ntail issues in a distributed system. However, these solutions\nwere built for specific scenarios with much scope left for\nanalyzing a variety of problems that can exist in a large\nscale big data processing system.\nTo the best of our knowledge, there is a lack of a generic\nand comprehensive solution for the detection of a wide\nrange of anomalies and performance of root-cause analysis\nin big data systems. Developing a general and extensible\nframework for diagnosing a big data system is not trivial.\nIt requires well-defined requirements which could enable\nthe broader adoption of root-cause analysis for the big\ndata systems, flexible APIs to interact with an underlying\nmonitoring system and integration of multiple solutions for\ndetecting performance reduction problems while enabling\nthe automatic root-cause analysis. In this paper, we tackle\nthis research gap, and design and develop AutoDiagn to au-\ntomatically detect performance degradation and inefficient\nresource utilization problems, while providing an online\ndetection and semi-online root-cause analysis for a big data\nsystem. 
Further, it is designed as a microservice architecture\nthat offers the flexibility to plug a new detection and root-cause\nanalysis module for various types of big data systems.\nThe contributions of this paper are as follows:\n\u000fAn online and generic framework: We develop a general\nframework called AutoDiagn which can be adapted for\nthe detection of a wide range of performance degrada-\ntion problems while pinpointing their root-causes in big\ndata systems.\n\u000fA case study: We develop a novel real-time stream pro-\ncessing method to detect symptoms regarding outliers\nin a big data system. After that, we develop a set of\nquery APIs to analyze the reasons that cause the outlier\nregarding a task.\n\u000fA comprehensive evaluation: We evaluate the feasibility,\nscalability and accuracy of AutoDiagn through a set of\nreal-world benchmarks over a real-world cloud cluster.\nThe paper is organized as follows. The design require-\nments and idea are outlined in §2. In §3, we illustrate the\nhigh-level system architecture. §4 presents a case study that\nwe implemented and the case study is evaluated in §5. §6\ndiscusses the limitations of this paper and highlights our\nfurther work . Before drawing a conclusion in §8, we discuss\nthe related work in §7.\n1. https://spark.apache.org/\n2. https://flink.apache.org/2 R EQUIREMENTS AND DESIGN IDEA\nIn this section, we analyze the key requirements of the\nreal-time big data diagnosis system, extracting the essential\nfeatures from the literature. 
Next, we present the key idea\nof the framework design.\n2.1 Fundamental prerequisite for diagnosing big data\nprocessing systems\nIn order to design a generic framework for diagnosing big\ndata processing systems, we classified the fundamental re-\nquirements of building a diagnosis system on such systems\nas follows:\n\u000fInfrastructure monitoring: Collecting the information\nabout the underlying system, such as network condi-\ntions, CPU utilization, memory utilization, and disk\nI/O status.\n\u000fTask execution monitoring: Collecting the task infor-\nmation, including execution time, progress, location,\nlocation of its input data, input data size, output data\nsize, CPU/memory usage, and process state (running,\nwaiting, succeeded, failed, killed).\n\u000fAbnormal behavior or fault detection: Detecting ab-\nnormal behaviors in big data processing systems, such\nas slowing tasks, failed tasks, very high/low resource\nusage, and experiencing very high response time for the\nrequests.\n\u000fRoot-cause analysis: Finding the root cause of perfor-\nmance reduction in big data processing systems, such\nas the reasons why: tasks are slowing down, resource\nutilization is low, the response time is high, or when the\nnetwork latency is high.\n\u000fVisualization: Visualizing the collected metrics and\nthe results of root-cause analysis of any failures caus-\ning performance reduction in the cluster with a user-\nfriendly interface in real-time.\n2.2 Key design idea\nMotivated by the above-mentioned requirements and in-\nspired by medical diagnosis, we highlight the design idea\nof root-cause analysis for big data processing systems as\nshown Fig. 2, which aims to provide holistic monitoring\nand root cause analysis for big data processing systems.\nFirst, a set of Symptom Detectors is defined and developed in\nSymptom Detection to detect the abnormalities of the big\nsystem by processing collected system information stream\nin real-time. 
Once a symptom (abnormality) is detected,\ntheDiagnosis Management may launch the corresponding\nDiagnosers to troubleshoot the cause of the symptom. One\nsymptom may correspond to root causes. Finally, the deci-\nsions are made based on the root-cause analysis results.\n2.3 The generalizability of AutoDiagn\nModern big data processing systems consists of two main\ntypes: Big data analytics (e.g., Hadoop, Spark) and Stream\nprocessing (e.g., Flink, Spark Stream). Based on our de-\nsign idea, our AutoDiagn is an independent framework\nthat can be deployed alongside existing big data cluster\nmanagement systems (e.g., Apache YARN), and ideally it\nis suitable for root-cause analysis of any big data processing\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 3]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n3\nsystem. However, for the scope of this paper and practi-\ncal certainty, the implementation of AutoDiagn focuses on\ndebugging root causes of performance degradation (e.g.,\nslow task execution time) in Hadoop due to faults such as\ndata locality, cluster hardware heterogeneity, and network\nproblems (e.g., disconnection). 
Although we have validated\nthe functionality of AutoDiagn in the context of Hadoop and\nconsidering different classes of workload (e.g., WordCount,\nGrep, TPC-H, TPC-DC, K-means clustering, PageRank), it is\ngeneralizable to other big data processing systems executing\nsimilar classes of workload.\n3 A UTODIAGN ARCHITECTURE\nFollowing the design idea laid out in §2, we introduce Auto-\nDiagn, a novel big data diagnosing system. We first illustrate\nthe high-level system architecture and then describe the\ndetails of each component. AutoDiagn is implemented in\nJava and all source code is open-source on GitHub3.\n3.1 Architecture overview\nAutoDiagn provides a systematic solution that automati-\ncally monitors the performance of big data systems while\ntroubleshooting the issues that cause performance reduc-\ntion. Fig. 3 shows its two main components: AutoDiagn\nMonitoring and AutoDiagn Diagnosing. AutoDiagn Monitoring\ncollects the defined metrics (logs) and feeds AutoDiagn Diag-\nnosing with them in real-time. Once the abnormal symptoms\nare detected by analyzing the collected metrics, a deeper\nanalysis is conducted to troubleshoot the cause of abnormal\nsymptoms.\nAutoDiagn Monitoring. AutoDiagn Monitoring is a de-\ncentralized real-time stream processing system that collects\ncomprehensive system information from the big data system\n(e.g., Hadoop Cluster). The Collected Metrics is a set of\npre-defined monitoring entities (e.g., CPU usage, memory\nusage, task location, task status) used to detect the abnormal\nsymptoms. Moreover, the system information, required for\nunderstanding the cause of detected abnormal symptoms,\nis collected in this modular.\nAutoDiagn Diagnosing. AutoDiagn Diagnosing is an event\nbased diagnosing system. First, the carefully crafted metrics\nare injected into the Symptom Detection Engine which is a\nreal-time stream processing module to detect the abnormal\nsymptoms in a big data system. 
In this paper, we use\nthe outlier which is a common symptom for performance\nreduction in a Hadoop cluster as a case study to demon-\nstrate the proposed framework. §4.1 illustrates the details\nof technology that we developed for symptom detection.\nMoreover, our system follows the principle of modular\nprogramming; the new symptom detection method can be\neasily plugged in. Diagnoser Plugins is a component for\ntrouble-shooting the reasons behind the detected symptom.\nA set of Diagnosers is instantiated by the Diagnoser Manager\nwhen their corresponding symptoms are detected. Then\nthe instantiated Diagnosers query a time series database to\nobtain the required input and their outputs illustrate the\ncause of the detected symptoms.\n3. https://github.com/umitdemirbaga/AutoDiagn3.2 AutoDiagn monitoring framework\nAutoDiagn monitoring framework is a holistic solution for\ncontinuous information collection in a big data cluster.\nThe framework needs to have a fast, flexible and dynamic\npipeline to transfer the collected data as well as a high per-\nformance, large scale storage system. We now describe an\nimplementation of the framework for a big data computer\ncluster, and the high-level system architecture is shown in\nFig. 4.\nInformation Collection. In each compute node, we develop\nand deploy an Agent to collect real-time system information.\nFor the worker node, the Agent collects the usage of com-\nputing resource via SIGAR APIs4, including CPU, memory,\nnetwork bandwidth, and disk read/write speeds. Moreover,\ntheAgent in the master node collects the usage of computing\nresource as well as the job and tasks information. The Filter\nis developed by using GSon Library5to remove the less im-\nportant information obtained from ResourceManager REST\nAPI’s6, thereby reducing the size of data transmission. 
The\ncollected information is sent to RabbitMQ7cluster which is\na lightweight and easy-to-deploy messaging system in each\ntime interval via Publisher.\nStorage. The acquired information is time series data, we\ntherefore choose InfluxDB8for data storage. InfluxDB is a\nhigh performance, scalable and open source time series data\nbase which provides a set of flexible open APIs for real-time\nanalytics. The Consumer subscribes the related stream topics\nfrom RabbitMQ and interacts with InfluxDB APIs to inject\nthe information to the data base.\nInteracting with AutoDiagn Diagnosing. The information\nrequired for symptom detection is directly forwarded and\nprocessed in AutoDiagn diagnosing via a consumer. If a\nsymptom is detected, InfluxDB will be queried by AutoDi-\nagn diagnosing for root-cause analysis. Finally, the analysis\nresults are sent back to the database to be stored.\nUser visualization. The user visualization allows the users\nto have a visible way to monitor their big data system. We\nutilize InfluxDB’s client libraries and develop a set of REST-\nful APIs to allow the users to query various information,\nincluding resource utilization, job and task status, as well as\nroot cause of performance reduction.\n3.3 AutoDiagn diagnosing framework\nIn this section, we discuss the core components of the\nAutoDiagn Diagnosing framework (see Fig. 3), as well as the\ninteractions with each other and the AutoDiagn Monitoring\nframework.\nSymptom Detection Engine. The symptom detection en-\ngine subscribes a set of metrics from the real-time streaming\nsystem. §4.1 illustrates the technique that we developed\nfor outlier detection. This component follows microservices\narchitecture to which new symptom detection techniques\ncan be directly attached to our AutoDiagn, interacting with\nother existing techniques to detect new symptoms.\n4. https://github.com/hyperic/sigar\n5. https://github.com/google/gson\n6. https://hadoop.apache.org/docs/r3.2.1/hadoop-yarn\n7. 
https://www.rabbitmq.com/\n8. https://www.influxdata.com/\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 4]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n4\nSymptom Detection Diagnosis Management Decision MakingSymptoms \n(N)\nRoot -cause 1\n•\n•\n••\n•\n•Root -cause 2\nRoot -cause M•\n•\n•Root -cause 3Diagnosis \n(M)\nRoot -cause 4MetricsSymptom \nDetector 2\nSymptom \nDetector NSymptom \nDetector 1Diagnoser 1\nDiagnoser 2\nDiagnoser 3\nDiagnoser 4\nDiagnoser MDecision 1\nDecision 2\n•\n•\n•\nDecision N Root -cause M -1 Diagnoser M -1\nFig. 2. The key design idea of root-cause analysis for big data processing systems\nAutoDiagn Diagnosing\nDiagnoser Plugins\nDiagnoser 1\nTask\nInput\nOutput\n…\nDiagnoser N\nTask\nInput\nOutputAutoDiagn Monitoring\nSymptom\nDetection\nEngine\nDiagnosis \ndecisionsCollected \nmetricsDetected\nSymptoms\nRoot -causes of the symptoms \nDiagnoser \nManager\nFig. 3. The high-level architecture of the AutoDiagn system\nDiagnoser Manager. The diagnoser manager is the core\nentity responsible for selecting the right diagnosers to find\nthe reasons that cause the detected symptoms. Additionally,\nthe diagnoser manager is developed as a front-end com-\nponent, triggered by various detected symptoms (events)\nvia a RESTful API, exposing all diagnosing actions within\nour framework. The API includes general actions such as\nstarting, stopping or loading a diagnoser dynamically, and\nspecific actions such as retrieving some metrics. 
Importantly,\nthe diagnoser manager is able to compose a set of diagnosers\nto complete the diagnosing jobs that may require the coop-\neration of different diagnosers.\nDiagnoser Plugins. The diagnoser plugin contains a set of\ndiagnosers; and a diagnoser is the implementation of the\nspecific logic to perform root-cause analysis of a symptom.\nEach diagnoser refers to a set of metrics stored in a time\nseries database as the input of its analysis logic. Whenever\nit is activated by the diagnoser manager, it will perform\nan analysis, querying the respective metrics, executing the\nanalytic algorithm, and storing the results. §4.2 discusses the\nalgorithms to detect the outlier problems, for example, in aHadoop cluster. The diagnoser plugin is also designed as\na microservice architecture which has two advantages: i) a\nnew diagnoser can be conveniently plugged or unplugged\non-the-fly without affecting other components; ii) new root-\ncause analysis tasks can be composed by a set of diagnosers\nvia RESTful APIs.\n3.4 AutoDiagn diagnosing interfaces for Hadoop\nAutoDiagn exposes a set of simple interfaces for system\nmonitoring, symptom detection and root-cause analysis.\nTable 1 shows that two types of APIs are defined: high-\nlevel APIs and low-level APIs. The high-level APIs consist\nofSymptom Detection, Diagnoser and Decision Making.\nThe Symptom Detection APIs are a set of real-time stream\nprocessing functions used to detect the defined symptoms\ncausing the performance reduction in the Hadoop system.\nEach Diagnoser is a query or a set of queries, which aim\nto find one of the causes of a symptom. For example,\nQueryNonLocal() tries to find all non-local tasks within a\ntime interval, which is one of the reasons that causes an out-\nlier. 
Finally, the Decision Making APIs are used to analyze\nthe results from each Diagnoser and make the conclusion.\nThese high-level APIs have to interact with the low-level\nAPIs (Information Collection) to obtain system information\nincluding resource usage, and the execution information of\nthe big data system (e.g., ask and job status in a Hadoop\nsystem). Based on this flexible design, users can define\nand develop their own Symptom Detection, Diagnoser and\nDecision Making APIs and plug them into AutoDiagn.\n3.5 Example applications\nWe now discuss several examples for big data system root\ncause applications using AutoDiagn API.\nOutliers. Outliers are the tasks that take longer to finish\nthan other similar tasks, which may prevent the subse-\nquent tasks from making progress. To detect these tasks,\nthe real-time stream query QueryOutlier() is enabled\nin the Symptom Detection Engine. This function consumes\neach task’s completion rate (i.e., progress) and the executed\ntime to identify the outlier tasks (detailed in §4.1). Next,\nthree APIs QueryNonlocal(), QueryLessResource()\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 5]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n5\nComputer Cluster\nMaster Node\nPublisher FilterCollector AgentResource\nInformationTask\nInformation\n…Message \nBrokerAutoDiagn \nDiagnosingManagement Node\nUser \nVisualization\n StorageConsumer\nConsumer\nWorker Node 1Publisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nWorker Node 2Publisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nWorker Node NPublisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nFig. 4. The high-level architecture of the monitoring framework\nandQueryNodeHealth(), corresponding to three Diag-\nnosers that are used to analyze the reasons causing the de-\ntected symptom, are executed. QueryNonlocal() queries\nwhether the input data is allocated on the node on which\nan outlier task is processed. In addition, QueryLessRe-\nsource() investigates whether outlier tasks are running\non the nodes that have less available resource. Moreover,\nQueryNodeHealth() examines if an outlier task is the\ntask that is a restarted task due to the disconnected nodes\nfrom the network. Finally, RootcauseOutlier() is used\nto process the results from the three Diagnosers and make\nthe conclusion. All the APIs are shown in Table 1 and the\ntechnical details are illustrated in §4.\nInefficient resource utilization. In our case this means that\nsome tasks are pending (or waiting) to be on worker nodes;\nat the same time, some worker nodes are idle, e.g., low CPU\nand memory usage. There are many reasons that cause this\nissue, but here we consider two key causes: task heterogeneity\nand resource heterogeneity. The type of tasks in a big data sys-\ntem are various, including CPU intensive tasks, IO intensive\ntasks and memory intensive tasks. 
However, the underlying\ncomputing resources are typically equally distributed to\nthese tasks, thereby causing inefficient resource utilization.\nThe latter is caused by the heterogeneous underlying com-\nputing resources due to the multiple concurrent processing\ntask environments and the queues are built on the saturated\nnodes.\nTo detect the inefficient resource utilization in a big data\nsystem, the real-time stream query QueryResourceU-\ntil() is used within a defined time interval. We com-\npute the mean and standard deviation of the usage re-\nsources of the whole cluster. If the standard deviation\nis far from the mean, we will further query whether\nthe tasks are queued on the nodes which have high\nresource usage rates. If inefficient resource utilization\nis detected, two Diagnosers, QueryOversubscribed()\nand QueryDiskIOboundTasks(), which are the root-\ncause analysis APIs shown in Table 1, are executed to perform root-cause analysis. QueryOversubscribed()\nchecks the type of tasks queuing on the saturated nodes.\nThe QueryDiskIOboundTasks() checks whether the sat-\nurated nodes have less available computing resource,\nwhile processing the allocated tasks. The conclusion of the\ncause of inefficient resource utilization is made in Root-\ncauseResInef().\n3.6 Parallel execution\nFollowing the key design idea, the diagnosers are triggered\nby the corresponding detected symptom. However, we are\nable to parallelize the execution of each symptom detector\nand its diagnosers by partitioning the input data. For ex-\nample, if one symptom detector needs to process too many\ndata streams, we can use two of the same instances of the\nsymptom detector to process the data streams and aggregate\nthe results from two symptom detectors. 
The diagnoser can\nfollow the same strategy for parallel execution.\n3.7 Reliability analysis\nAutoDiagn follows the centralized design for data collec-\ntion, which simplifies the implementation of the Symptom\nDetection, Diagnosis Management and Decision Making. They\ncan easily obtain the required information from one place,\ninstead of interacting with the entire big data system. More-\nover, the centralized design does not mean unreliability, due\nto the high-availability of RabbitMQ. The RabbitMQ cluster\ncan overcome the node fail in the message queuing system\nwhile ensuring scalability.\n4 C ASESTUDY\nIn the previous section, we have discussed that our frame-\nwork supports detection of multiple types of symptoms\n(e.g., outliers, inefficient resource utilization). However, de-\ntecting these symptoms is non-trivial; and each symptom\ncan be detected by using different algorithms with different\ninput metrics. In this section, we present a case study that\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 6]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n6\nTABLE 1\nAutoDiagn diagnosing interface. 
See §3.4 for definitions and examples\nSymptom Detection (High-level APIs) Description\nQueryOutlier() Execute a Query that returns the list of outliers if any.\nQueryResourceUtil() Execute a Query that returns the list of the worker nodes in which the computing resources are not uti-\nlized effectively if any.\nDiagnoser (High-level APIs) Description\nQueryNonLocal() Execute a Query that return the list of non-local tasks if any.\nQueryLessResource() Execute a Query that returns false if the cluster is not homogeneous in terms of having resource capacity (CPU/memory).\nQueryNodeHealth() Execute a Query that returns the list of disconnected worker nodes in the cluster if any.\nQueryOversubscribed() Execute a Query that returns the list of the oversubscribed tasks if any.\nQueryDiskIOboundTasks() Execute a Query that returns the list of the disk- or IO-bound tasks if any.\nDecision Making (High-level APIs) Description\nRootcauseOutlier() Execute a Query that illustrate the main reason of the cause of the outlier.\nRootcauseResInef() Execute a Query that illustrate the main reason of the cause of inefficient resource utilization.\nInformation Collection (Low-level APIs) Description\ntaskExecTime() Return the execution time since the task started in sec.\ntaskProgress() Return the progress of the running task as a percentage.\ntaskInput() Return the input data size of the running task in mb.\ntaskBlock() Return the block id this task process.\ntaskHost() Return the name of the node thistask ran on.\ntaskCPUusage() Return the CPU usage of the task.\ntaskMemoryUsage() Return the memory usage of the task.\ntaskContainerCPU() Return the allocated CPU to the container this task ran on.\ntaskContainerMemory() Return the allocated memory to the container this task ran on.\nblockHost() Return the names of the nodes that host the block.\npendingTasks() Return the number of the tasks waiting to be run.\nnodeTotalCoreNum() Return the number of the CPU core number of the 
node.\nnodeCPUUsage() Return the CPU utilization of the node.\nnodeTotalMem() Return the total memory capacity of the node.\nrestartedTasks() Return the name of the restarted tasks due to nodes that got disconnected from the network.\nnodeMemUsage() Return the memory utilization of the node.\nnodeDiskReadSpeed() Return the disk read speed of the node.\nnodeDiskWriteSpeed() Return the disk write speed of the node.\nnodeUploadSpeed() Return the network upload speed of the node.\nnodeDownloadSpeed() Return the network download speed of the node.\ndetails the technology of detecting outliers and the root-\ncauses analysis for the detected outliers. The notations used\nin this paper are summarized in Table 2.\nTABLE 2\nA summary of symbols used in the paper\nSymbols Description\nJp Job progress\nN Name of the task\nNl List ofN\nP Performance of the N\nPl List ofP\nO Progress of theN\nOl List ofO\nT Execution time of the N\nTl List ofT\nmed The performance of median task\nD Non-local tasks\nDl List of Non-local task\nR Task running on the node with less resources\nRl List ofR\nW Restarted tasks due to the nodes’ network failure\nWl List ofW\nSl List of outlier task\nSd Non-local outlier\nSdl List of Sd\nSr Outlier stemming from the resource variation\nSrl List of Sr\nSw Outlier stemming from disconnected nodes\nSwl List of Sw\nF Factor value of 1.5 used to find the S4.1 Symptom detection for outliers\nAnanthanarayanan et al. [10] defined the outlier tasks’ run-\ntime to be 1.5 times higher than that of the median task\nexecution time; their method is based on the assumption\nthat all tasks are started at the same time and are the same\ntype (i.e., the same input data and the same processing\ncode), which is not suitable for real-time symptom detection,\nbecause in a time interval the tasks may be submitted at\ndifferent times; the input data size of the tasks and the code\nfor tasks are not always the same. 
In this paper, we use\nPerformance (P) to measure the outlier as shown in Eq 1. O\nrepresents the normalized value of the task progress in terms\nof percent work complete, and T is the normalized value of\nthe task execution time.\nP = O / T  (1)\nEq 2 is used to normalize the O and T, where xmin and\nxmax are the minimal and maximal values of the given\nmetrics (e.g., task progress and execution time) in a time\ninterval. We set b = 1 and a = 0.1 to restrict the normalized\nvalues within the range from 0.1 to 1 [12].\nx_norm = a + ((x − x_min)(b − a)) / (x_max − x_min)  (2)\nMoreover, we define the outlier tasks which have 1.5\ntimes less performance value than the median performance\nvalue in each time interval. Fig. 5 shows a snapshot of a time\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 7]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n7\nAlgorithm 1: Automated symptom detection for\noutliers\nInput: Jp- job progress in percentage,\nF- factor,\nN- name of the running task,\nNl- list ofN,\nO- progress of the task,\nOl- list ofO,\nT- execution time of the task,\nTl- list ofT.\nOutput: Sl- list of outliersS.\n1// Create a list Slto store theS\n2Sl Sl[0]\n3// Initialize the med\n4med med[0]\n5while Jp<100.0 do\n6 //Clear the SlandPl\n7 Sl Clear (Snew\nl ,Sl)\n8 Pl Clear (Pnew\nl ,Pl)\n9 foreachNinNldo\n10 //ComputeP\n11P=O\nT\n12 //Insert thePinto the Pl\n13 Pl.add(P )\n14 end\n15 //Get themedfrom thePl\n16 med Median value of Pl\n17 foreach value of Pldo\n18 if(P*F)< m edthen\n19 //Insert theNinto theSl\n20 Sl.add(N )\n21 end\n22 end\n23 //Update the SlinDiagnosis Generation component\n24 Sl Update (Snew\nl ,Sl)\n25 //Update the Nl,Ol,Tl,Jp\n26 Nl Replace (Nnew\nl ,Nl)\n27 Ol Replace (Onew\nl ,Ol)\n28 Tl Replace (Tnew\nl ,Tl)\n29 Jp Replace (Jnew\np ,Jp)\n30end\ninterval (e.g., three seconds), and two mappers are identified\nas outliers. More evaluations will be discussed in §5.\nAlgorithm 1 demonstrates the proposed ASD (auto-\nmated symptom detection) algorithm in the AutoDiagn\nsystem. It is fed by the streaming data provided by the\nAutoDiagn Monitoring system during job execution. First,\nthe performance of each running task is calculated (see\nAlgorithm 1, Line 11) using Eq 1. Next, the median value\nof the performance of all tasks is taken to be used to detect\noutliers (see Algorithm 1, Line 16). Then, the tasks whose\nperformance is 1.5 times less than the performance of the\nmedian task are selected as outliers (see Algorithm 1, Line\n20). 
As a final step, these tasks detected as outliers are sent to\nthe Diagnosis Generation component for root-cause analysis\n(see Algorithm 1, Line 24).\n4.2 Root cause analysis for outliers\nWhen the detected symptoms are passed to the Diagnoser\nManager, the corresponding Diagnosers are executed for\ntrouble-shooting. The following subsection illustrates the\ntechnologies that we have developed for analyzing the\ncauses of outliers in a Hadoop cluster.\n4.2.1 Root cause of outliers\nIn this paper, we follow the three main reasons that cause\noutliers, discussed in [10], i.e., Data locality, Resource het-\nerogeneity, and Network failures.\nProgress (%)Execution time (sec) 0 1 2OutliersMedian=1.11Performance levels \n 30 35 40 45 50 55 60 65 14 16 18 20 22 24 26 28 30 32\nPerformance 0.2 0.4 0.6 0.8 1 1.2 1.4\nFig. 5. Performance evaluation of the tasks\nData locality. Hadoop Distributed File System (HDFS)\nstores the data in a set of machines. If a task is scheduled to\na machine which does not store its input data, moving data\nover the network may introduce some overheads to cause\nthe outliers issue.\nResource heterogeneity. The machines in a Hadoop cluster\nmay be homogeneous with the same hardware configura-\ntion, but the run-time computing resources are very hetero-\ngeneous due to the multi-tenant environment, multiple\nconcurrent processing task environment, machine failures,\nmachine overload, etc. If a task is scheduled to a bad\nmachine (e.g., has less computing resource) it may cause\nan outlier issue. Moreover, resource management systems\nfor a large-scale cluster like YARN split the tasks over the\nnodes equally without considering the resource capacities of\nthe nodes in the cluster, but only takes into account sharing\nthe node’s resources among the tasks running on the node\nequally by default [13]. That is more likely to raise an outlier\nproblem in the cluster.\nNetwork failure. 
In Hadoop clusters, the network discon-\nnection can cause the running tasks allocated on a discon-\nnected node to be restarted on other nodes, which may lead\nto the task becoming an outlier and, increase the completion\ntime. The following illustrates the three algorithms that\nwe developed to identify the outliers caused by the three\nreasons.\n4.2.2 Detecting data locality issues\nWe assume that a non-local task (D ) (e.g., mapper) is ex-\necuted on a node where its input data is not stored (In the\nfollowing, we use Sdto represent non-local outliers). To detect\nthese tasks, we develop Algorithm 2 to check whether a set\nof outliers is caused by a data locality issue. The input of\nour algorithm is a list of detected outliers during the time\ninterval from ttot+ 1 and one of its outputs is a list of\noutliers which also belongs to the non-local tasks. First, we\nquery our time series database to obtain all non-local tasks\nwithin the given time interval (see Algorithm 2, Line 2).\nHere, QueryNonLocal(), a root-cause analysis API, is\nused to find the non-local ones among the running tasks\nin that period of time. It compares the location where the\ntask is running (host node of the task) with the nodes\nwhere the data block is replicated for fault tolerance via\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 8]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n8\ninformation collection APIs shown in Table 1, taskHost()\nandblockHost(). 
If the task is not running on any of\nthese nodes (nodes hosting a copy of the block), this task\nis marked as a non-local task. In the second step (Algorithm\n2, Line 4), we obtain the common elements of list DlandSl.\nThese elements symbolize the non-local outliers stemming\nfrom a data locality issue.\n4.2.3 Detecting resource heterogeneity issues\nAlgorithm 2 is designed to identify the outliers caused by\nthe resource heterogeneity. The tasks running on the nodes\nwhich have less computing resource (R ) tend to be outliers\n[14] (in the following, we use Srto represent outliers running\non the nodes which have less computing resource). In Algorithm\n2, the list of detected outliers during the time interval from\nttot+ 1 is used as input and one of the outputs of the\nalgorithm is a list of outliers which also belongs to the tasks\nrunning on the node with less computing resource. The time\nseries database is queried to obtain all the tasks running on\nthe node with less computing resource within the given time\ninterval (see Algorithm 2, Line 6).\nHere, QueryLessResource(), a root-cause analysis\nAPI, is used to check the heterogeneity of the nodes that host\nonly the running tasks based on the resource specifications\nof them in that period of time. It detects the nodes with less\nresource capacity in terms of CPU core numbers and the to-\ntal amount of memory among the nodes hosting the running\ntasks. The resource specifications of the nodes (i.e., CPU\ncore numbers, total amount of memory) are obtained from\neach node via information collection APIs shown in Table 1,\nnodeTotalCoreNum() andnodeTotalMem() APIs. As a\nsecond step (Algorithm 2, Line 8), we obtain the common\nelements of list RlandSl. 
These elements symbolize the\noutliers stemming from a cluster heterogeneity issue.\n4.2.4 Detecting network failure issues\nSince Sl is obtained from Algorithm 1, a Diagnoser is exe-\ncuted via QueryNodeHealth() to find all restarted tasks\ndue to the nodes disconnected by network failure within the\ngiven time interval (see Algorithm 2, Line 10). The low-level\nAPI restartedTasks() is called which distinguishes the\nrestarted tasks due to network failure from the speculation\nof straggler tasks by analyzing the information of the tasks\nthat is provided by the monitoring agent. Thereafter, we\ncompute the list Swl that contains the outlier tasks caused\nby the network failure (see Algorithm 2, Line 12).\n4.2.5 Decision making\nIn this case study, we use a simple decision-making method\nthat compares the lists Sdl, Srl and Swl, and the probability\nof the reasons causing the outliers by using the number\nof the elements of a list divided by the total number of out-\nlier tasks. For instance, the probability of the performance\nreduction caused by data locality is |Sdl| / |Sl|. 
More advanced\nmethods such as deep learning models can be used for pro-\ncessing more complicated decision making tasks in future\nwork.\nAlgorithm 2: Root-cause analysis of outliers\nInput: Sl - list of outliers in time interval from t to t+1\nOutput: Sdl - list of non-local outliers Sd,\nSrl - list of outliers stemming from resource variation Sr,\nSwl - list of outliers stemming from disconnected nodes Sw.\n1 // Find all D within the given time interval\n2 Dl ← QueryNonLocal(t, t+1)\n3 // Find the common elements in the Dl and Sl, and add them\ninto the Sdl\n4 Sdl ← RetainAll(Dl, Sl)\n5 // Find all R within the given time interval\n6 Rl ← QueryLessResource(t, t+1)\n7 // Find the common elements in the Rl and Sl, and add them\ninto the Srl\n8 Srl ← RetainAll(Rl, Sl)\n9 // Find all W within the given time interval\n10 Wl ← QueryNodeHealth(t, t+1)\n11 // Find the common elements in the Wl and Sl, and add them\ninto the Swl\n12 Swl ← RetainAll(Wl, Sl)\n5 EVALUATION\nIn this section, we present a comprehensive evaluation\nshowing the capacity and the accuracy rate of AutoDiagn,\nas well as an analysis of its resource consumption and over-\nheads.\n5.1 Experimental setup\nEnvironments. We set up the Hadoop YARN clusters over\n31 AWS nodes with 1 master and 30 slaves with the Oper-\nating system of each node being Ubuntu Server 18.04 LTS\n(HVM). The Hadoop version is 3.2.1 and the Hive version\nis 3.1.1. To meet our experimental requirements, we built\ntwo types of cluster. In Type I each node has the same\nconfiguration (i.e., 4 cores and 16 GB memory). In Type II,\n25 nodes have 4 cores and 16 GB memory and 6 nodes have\n2 cores and 4 GB memory.\nBenchmarks and workload. We used six well-known\nHadoop benchmarks in our evaluations namely: Word-\nCount9, Grep10, TPC-H11, TPC-DS12, K-means clustering13,\nand PageRank14. The input of each benchmark application\nis 30GB.\nMethodology. Our experiments aim to evaluate the effec-\ntiveness of AutoDiagn. 
To this end, we manually inject the\nabove-mentioned three main reasons to cause the outliers,\nwhich can be summarized as three types of execution en-\nvironment. EnvA: we perform all benchmark experiments\nin the cluster Type I. EnvB: we perform all benchmark\nexperiments in the cluster Type I, but skew the input size\nstored on different nodes. EnvC: we perform all benchmark\nexperiments in the cluster Type II (a heterogeneous cluster).\nEnvH: we perform all benchmark experiments in the cluster\nType I, and disconnect some nodes’ network during execu-\ntion. Each benchmarking is repeated 5 times and results are\nreported as the average and standard deviation. In total,\nthere are 90 experiments conducted in our evaluation.\n9. http://wiki.apache.org/hadoop/WordCount\n10. http://wiki.apache.org/hadoop/Grep\n11. http://www.tpc.org/tpch/\n12. http://www.tpc.org/tpcds/\n13. https://en.wikipedia.org/wiki/K-means clustering\n14. https://en.wikipedia.org/wiki/PageRank\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 9]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n9\nTABLE 3\nThe accuracy of symptom detection for non-local outliers in a\nhomogeneous cluster\nBenchmark Total\ntasksD Outliers\n(detected as Sd)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 32 29 90.63 3.9\nGrep 236 37 33 89.19 4.8\nTPC-H 102 13 12 92.31 6.72\nTPC-DS 126 13 12 92.31 6.1\nK-means 234 34 29 85.29 1.25\nPageRank 235 28 25 89.29 6.2\nTABLE 4\nThe accuracy of symptom detection for the outliers stemming from\nresource variation in a heterogeneous cluster\nBenchmark Total\ntasksR Outliers\n(detected as Sr)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 37 33 89.19 2.77\nGrep 236 26 24 92.31 4.77\nTPC-H 102 9 8 88.89 5.47\nTPC-DS 126 13 12 92.31 6.9\nK-means 234 36 33 91.67 2.88\nPageRank 235 30 28 93.33 5.35\n5.2 Diagnosis detection evaluation\nIn this section, we evaluate the accuracy of our symptom\ndetection method. To this end, we execute our benchmarks\ninEnvBto increase number of Sdtasks (see §4.2.2). Next,\nto increase the issue of resource heterogeneity (Sr referring\nto §4.2.3), we run the benchmarks in EnvC. Thereafter,\nwe run the benchmarks in EnvHto emulate the network\nfailure (Sw referring to §4.2.4). Finally, we compare the\ndetected Outlier tasks with the ground truths that are the\ndata locality, resource heterogeneity, and network failure\nissues observed by the AutoDiagn diagnosing system.\nTable 3, Table 4, and Table 5 summarize all the results. All\nbenchmarks achieve high accuracy by using our proposal\nsymptom detection method. The highest accuracy for both\nSdand Srare 92.3%, and for Swis 94.7% and the overall\naccuracy for outlier detection is 91.3%, where the Error\nrepresents the variation of the accuracy depending on the\nrepeated experiments.\nWe compute the accuracy of our symptom detection\nmethod by using the number of detected outlier tasks di-\nvided by the actual number of the tasks that can cause the\noutlier issue. 
Table 3, for example, Dis the total number of\nnon-local tasks and Outliers (Sd) is the number of detected\noutlier tasks that belong to non-local task. Therefore, the\naccuracy isSd\nD. Table 4 and Table 5 follow the same approach\nto compute the accuracy.\nOutlier verification. To further verify the Sd,Sr, and Sw\nare the main reasons causing the outliers, we conduct the\nfollowing comparison experiments: 1) comparing the exe-\ncution time of local tasks and non-local tasks; 2) comparing\nthe execution time of the tasks running in EnvAand Env\nC; and 3) comparing the execution time of normal tasks and\nrestarted tasks due to network failure. Fig. 6(a) proves that\nnon-local tasks consume more time than local tasks due to\nthe overload introduced by data shuffling. Additionally, weTABLE 5\nThe accuracy of symptom detection for the outliers stemming from\nnetwork failures\nBenchmark Total\ntasksW Outliers\n(detected as Sw)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 11 10 90.91 1.83\nGrep 236 13 12 92.31 6.73\nTPC-H 102 13 12 92.31 6.54\nTPC-DS 126 15 14 93.33 5.43\nK-means 234 17 16 94.12 4.33\nPageRank 235 19 18 94.74 4.23\ncompare the throughput of the local tasks and non-local\ntasks in terms of how much data can be processed in each\nsecond. Fig. 7 reveals that the throughput of non-local tasks\nis only 70% that of local tasks.\nMoreover, Fig. 6(b) shows that the execution time of\nthe tasks running on EnvAis less than that on EnvC.\nThis is because the tasks are equally distributed to all\ncomputing nodes and the less powerful nodes are saturated.\nFurthermore, Fig. 9(a) shows that the CPU usage of less\npowerful hosts reaches 100%, thereby building a task queue\nin these hosts, increasing the overall execution time. How-\never, Fig. 9(b) reveals that the powerful hosts have sufficient\ncomputing resources for processing the allocated tasks.\nFurthermore, Fig. 6(c) shows that the execution time of\nthe restarted tasks are longer than the normal tasks. As\nFig. 
8 illustrates, we compute the execution time of the\nrestarted task by adding the execution time of the task in\nthe disconnected node and that in the rescheduled node.\n5.3 Performance and overheads\nPerformance evaluation. We evaluate the performance of\nAutoDiagn by measuring the end-to-end response time of\nsymptom detection and root-cause analysis. Since they are\nnot affected by the types of benchmark, we report the\naverage of the response time. Fig. 10(a) shows that the\nreal-time symptom detection can achieve a low response\ntime, which only has 96 milliseconds and 1059 milliseconds\nwith 100 tasks and 1000 tasks, respectively. Although the re-\nsponse time increases linearly, the parallel execution method\ndiscussed in §3.6 can be applied to reduce the latency. The\nresponse time for root cause analysis is higher than that\nof symptom detection. For 100 tasks and 1000 tasks, their\nresponse times are 0.354 seconds and 5.974 seconds, respec-\ntively. Unlike the symptom detection which is very sensitive\nto latency because of the follow-up processes, triggering the\nfurther root-cause analysis or alerting the system managers,\nRoot-cause analysis aims to provide a holistic diagnosing of\na big system and the analysis results may help to improve\nthe system performance in future. As a result, the real-time\nroot-cause analysis is not compulsory.\nSystem overheads. To evaluate the system overhead intro-\nduced by AutoDiagn, we measure the CPU and memory\nusage of AutoDiagn Monitoring (agent) and AutoDiagn\nDiagnosing. Table 6 shows that -AutoDiagn Monitoring only\nconsumes approximately 2.52% memory and 4.69% CPU;\nwhile -AutoDiagn Diagnosis uses 2.08% memory and 3.49%\nCPU.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 10]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. 
See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n10\n 0 5 10 15 20 25 30 35 40\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingLocal tasks running on Env A\nNon-local tasks (D) running on Env B\n(a) Local tasks vs Non-local tasks\n 0 5 10 15 20 25 30 35 40\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingTasks running on Env A\nTasks (R) running on Env C (b) Homogeneous cluster vs Heterogeneous\ncluster\n 0 10 20 30 40 50 60\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingTasks running on Env A\nTasks (W) running on Env H(c) Normal tasks vs Restarted tasks caused by\nnetwork failure\nFig. 6. Comparison of execution time of the tasks\n 0 1 2 3 4 5 6\nWordCountGrepTPC-HTPC-DS K-means PageRankThroughput (MB/s)\nTypes of BenchmarkingLocal tasks Non-local tasks\nFig. 7. The throughput of AutoDiagn\n 0 20 40 60 80 100\n0510152025303540455055Progress (%)\nElapsed time (sec)\nFig. 8. The life cycle of the restarted task\nFig 10(b) shows the network overhead of AutoDiagn.\nThe extra communication cost introduced by our tool is\nsmall but it increases when the number of parallel tasks\nincreases. For example, when the number of parallel task is\n100, there are about 45 messages per second sent from agents\nto RabbitMQ cluster and the total size of these messages is\n13.5 KB/s. The message rate and network overhead increase\nto 615 per second and 223 KB/s, respectively, when the\nnumber of parallel tasks is 1000.\nStorage overheads. AutoDiagn needs to dump the system\ninformation to a database which may consume extra storage\nresource. 
In our evaluation experiments, it only cost 3.75\nMB disk space in total. Obviously, increasing the types\nof symptom detection and root cause analysis will also\nconsume more storage resources. We discuss the potentialTABLE 6\nResource overhead caused by AutoDiagn components\nComponents Mem (%) CPU (%)\nAutoDiagn Monitoring 2.52 4.69\nAutoDiagn Diagnosing 2.08 3.49\nfuture work in §6.\n6 D ISCUSSION AND FUTURE WORK\nPopulating applications. In this paper, we propose a gen-\neral and flexible framework to uncover the performance\nreduction issues in a big data system. In particular, we\ndevelop and evaluate big data applications for outliers. New\napplications (including symptom detection and root-cause\nanalysis) are required to populate our system for future\nwork.\nOverhead cost reduction. Our system is designed in a\nloosely-coupled manner, the processing components can\nbe easily scaled. However, the storage overhead increases\nwith the number of applications increasing. [15] proposed a\ncaching method to aggregate the information before sending\nto destination nodes. We will explore this direction in future\nwork to reduce the storage overhead and network overhead.\nPerformance improvement. Mantri [10] utilized the outputs\nof the root cause analysis to improve the resource allocation\nin Hadoop clusters. Thus, one open research direction is to\nbuild a system which can react to analysis results, thereby\nimproving the performance of the big data system.\n7 R ELATED WORK\nMuch recent work in big data systems focuses on improving\nworkflows [16], [17], [18], programming framework [19],\n[20], [21], task scheduling [22], [23], [24].\nRoot-cause analysis. There is a large volume of published\nstudies describing the role of root-cause analysis. The au-\nthors of [10], [25], [26] take the next step of understanding\nthe reasons for performance reduction. 
Mantri [10] charac-\nterizes the prevalence of stragglers in Hadoop systems as\nwell as troubleshooting the cause of stragglers. Dean and\nBarroso [25] analyze the issues causing tail latency in big\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 11]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n11\n 0 20 40 60 80 100 CPU usage (%)\nTimelineCPU utilization Outliers\n(a) CPU utilization of less powerful hosts and outliers\n 0 20 40 60 80 100 CPU usage (%)\nTimelineCPU utilization (b) CPU utilization of high power hosts\nFig. 9. CPU utilization of two nodes running simultaneously. Outliers are most likely to occur in the nodes which have less computing resource.\n 0 1 2 3 4 5 6\n50100 200 300 400 500 600 700 800 9001000Response time (sec)\nNumber of tasks running in parallelSymptom detectionRoot-cause analysis\n(a) The end-to-end response time of AutoDiagn diag-\nnosis system\n 0 100 200 300 400 500 600\n501002003004005006007008009001000 0 50 100 150 200 250Messages per second\nData rate (KB/s)\nNumber of tasks running in parallelMessage rates\nSize (KB/s)(b) The message rates and network overhead\nFig. 10. Performance evaluation and network overhead of AutoDiagn\ndata systems. Garraghan et al. [11], [27] proposed a new\nmethod to identify long tail behavior in big data systems\nand evaluated in google data trace. 
The authors in [28] use\noffline log analysis methods to identify the root cause of\noutliers in a large-scale cluster consisting of thousands of\nnodes by tracking the resource utilization. Similarly, Zhou\net al. [29] use a simple but efficient rule based method to\nidentify the root cause of stragglers.\nAlong with these similar works, there are some re-\nsearchers using statistical and machine learning methods for\nroot-cause analysis. The authors of [30] introduce a Regres-\nsion Neural Network (RNN) based algorithm to trouble-\nshoot the causes of stragglers by processing Spark logs.\nMore algorithms such as the associated tree and fuzzy data\nenvelopment analysis [31] and Reinforcement Learning [32]\nare applied for finding the reasons of stragglers in Hadoop\nand Spark.\nIn [33], a Pearson coefficient of correlation is used for\nroot cause analysis to measure linear correlation between\nsystem metrics, workload and latency. However, these\nworks lack a systematic solution for root cause analysis for\nbig data processing systems and the proposed methods are\nnot applicable for real-time systems.\nDifferent to other work, the authors of [34] propose a\nnew algorithm that aims to reduce the proportion of strag-\ngler tasks in machine learning systems that use gradient-\ndescent-like algorithms. This work offers an idea to develop\nnew Diagnosers for machine learning systems using our\nframework.\nAnomaly detection and debugging. The authors in [35] pro-\npose a rule-based approach to identify anomalous behaviorsin Hadoop ecosystems by analyzing the task logs. This\nwork only analyzes the task logs, which fails to capture the\nperformance reduction issues caused by inefficient utilizing\nthe underlying resources. Next, Khoussainova et al. 
[36]\nbuild a historical log analysis system to study and track\nthe MapReduce jobs which cause performance reduction\nbased on their relevance, precision and generality principles.\nHowever, this cannot be performed for real-time anomaly\ndetection. Du et al. [37] train a machine learning model from\nthe normal condition data by using Long Short-Term Mem-\nory (LSTM) and this trained model is used for detecting\nin Hadoop and OpenStack environments. Our AutoDiagn\nprovides infrastructure into which the trained models can\nbe plugged to enrich the applications.\nReal-time operational data analytic system. Agelastos et al.\n[38] propose a monitoring system for HPC systems, which\ncan capture the cases of applications competing for shared\nresources. However, this system does not consider root-\ncause analysis of the performance reduction. The authors\nof [5], [39] do not only provide the feature of real-time\nmonitoring, but are also able to identify the performance\nissues and trouble-shoot the cause of the issues. In addition\nto them, [40] uses a type of artificial neural network called\nautoencoder for anomaly detection. They first monitor the\nsystem in real-time and collect the normal data for training\nthe model used to discern between normal and abnormal\nconditions in an online fashion. However, these systems are\ndeveloped for HPC clusters and are not suitable for big data\nsystems.\nTable 7 presents a brief overview of various monitoring\ntools for big data frameworks.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 12]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. 
Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n12\nTABLE 7\nThe features supported by existing tools and AutoDiagn\nFeature DataDog\n[2]Sequence\nIQ [3]Sematext\n[4]TACC\n[5]Mantri\n[10]DCDB\n[39]Nagios\n[41]Ganglia\n[42]Chukwa\n[43]DMon\n[44]AutoDiagn\nReal-time monitor-\ningYes Yes Yes Yes Yes Yes Yes Near\nreal-timeYes Near real-\ntimeYes\nRoot-cause analysis No No No No Yes Yes No No No Yes Yes\nBigData frameworks\nsupportGood Poor Good No Poor No Poor Poor Poor Good and\nExtensibleGood and\nExtensible\nUnderlying resource\nmonitoringYes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes\nReal-time monitor-\ning for big data tasksYes Yes Yes No Yes No No No Yes Yes Yes\nAuto-scaling Yes Yes Yes Yes Yes Yes No No Yes Yes Yes\nAlerts Yes No Yes No No No Yes No No No Yes\nVisualization of big\ndata tasksYes No Yes No No No No Yes No No Yes\nUser customized\nroot-cause analysisNo No No No No No No No No No Yes\n8 C ONCLUSION\nIn this paper, we have presented AutoDiagn, a framework\nfor enabling diagnosing of large-scale distributed systems\nto ascertain the root cause of outliers, with the core purpose\nof unravelling the concretization of complicated models\nfor system management. After making a comprehensive\nliterature review and identifying the requirements for real-\nworld problems, we conceived its design. The combination\nof user-defined functions powered by APIs and the agent-\nbased monitoring system along with the findings obtained\nfrom an empirical analysis of the experiments we conducted\nplay a fundamental role in the development of the system.\nAutoDiagn can be applied to most big data systems along\nwith the monitoring systems. We have also presented the\nimplementation and integration of the AutoDiagn system to\nthe SmartMonit [45], real-time big data monitoring system,\ncombined in our production environment. 
In our implemen-\ntation on a large cluster, we find AutoDiagn very effective\nand efficient.\nOutliers are one of the main problems in big data sys-\ntems that overwhelm the whole system and reduce perfor-\nmance considerably. AutoDiagn embraces this problem to\nreveal the bottlenecks alongside their root causes.\nACKNOWLEDGEMENT\nThis research is funded by the Turkish Ministry of Na-\ntional Education. This research is partially funded by\nthe following UKRI projects: SUPER (EP/T021985/1),\nPACE (EP/R033293/1), and Centre for Digital Citizens\n(EP/T022582/1). This work is also supported by the grant\nof National Natural Science Foundation of China (62072408)\nand Zhejiang Provincial Natural Science Foundation of\nChina (LY20F020030).\nREFERENCES\n[1] A. Noor, K. Mitra, E. Solaiman, A. Souza, D. N. Jha, U. Demirbaga,\nP . P . Jayaraman, N. Cacho, and R. Ranjan, “Cyber-physical appli-\ncation monitoring across multiple clouds,” Computers & Electrical\nEngineering, vol. 77, pp. 314–324, 2019.[2] Datadog. Accessed: 2020-07-13. [Online]. Available: https:\n//www.datadoghq.com/\n[3] Sequenceiq. Accessed: 2020-07-14. [Online]. Available: https:\n//github.com/sequenceiq\n[4] Sematext. Accessed: 2020-07-13. [Online]. Available: https:\n//sematext.com/\n[5] R. T. Evans, J. C. Browne, and W. L. Barth, “Understanding\napplication and system performance through system-wide moni-\ntoring,” in 2016 IEEE International Parallel and Distributed Processing\nSymposium Workshops (IPDPSW). IEEE, 2016, pp. 1702–1710.\n[6] G. Iuhasz, D. Pop, and I. Dragan, “Architecture of a scalable\nplatform for monitoring multiple big data frameworks,” Scalable\nComputing: Practice and Experience, vol. 17, no. 4, pp. 313–321, 2016.\n[7] I. Dr ˘agan, G. Iuhasz, and D. Petcu, “A scalable platform for\nmonitoring data intensive applications,” Journal of Grid Computing,\nvol. 17, no. 3, pp. 503–528, 2019.\n[8] S. 
Babu, “Towards automatic optimization of mapreduce pro-\ngrams,” in Proceedings of the 1st ACM symposium on Cloud com-\nputing, 2010, pp. 137–142.\n[9] R. S. Xin, J. Rosen, M. Zaharia, M. J. Franklin, S. Shenker, and\nI. Stoica, “Shark: Sql and rich analytics at scale,” in Proceedings of\nthe 2013 ACM SIGMOD International Conference on Management of\ndata, 2013, pp. 13–24.\n[10] G. Ananthanarayanan, S. Kandula, A. G. Greenberg, I. Stoica,\nY. Lu, B. Saha, and E. Harris, “Reining in the outliers in map-\nreduce clusters using mantri.” in Osdi, vol. 10, no. 1, 2010, p. 24.\n[11] P . Garraghan, X. Ouyang, P . Townend, and J. Xu, “Timely long\ntail identification through agent based monitoring and analytics,”\nin2015 IEEE 18th International Symposium on Real-Time Distributed\nComputing. IEEE, 2015, pp. 19–26.\n[12] J. Han, J. Pei, and M. Kamber, Data mining: concepts and techniques.\nElsevier, 2011.\n[13] T. Renner, L. Thamsen, and O. Kao, “Coloc: Distributed data and\ncontainer colocation for data-intensive applications,” in 2016 IEEE\nInternational Conference on Big Data (Big Data). IEEE, 2016, pp.\n3008–3015.\n[14] A. Rasooli and D. G. Down, “Guidelines for selecting hadoop\nschedulers based on system heterogeneity,” Journal of grid com-\nputing, vol. 12, no. 3, pp. 499–519, 2014.\n[15] A. Rabkin, M. Arye, S. Sen, V . S. Pai, and M. J. Freedman,\n“Aggregation and degradation in jetstream: Streaming analytics in\nthe wide area,” in 11thfUSENIXg Symposium on Networked Systems\nDesign and Implementation (fNSDIg 14), 2014, pp. 275–288.\n[16] Z. Wen, T. Lin, R. Yang, S. Ji, R. Ranjan, A. Romanovsky, C. Lin,\nand J. Xu, “Ga-par: Dependable microservice orchestration frame-\nwork for geo-distributed clouds,” IEEE Transactions on Parallel and\nDistributed Systems, vol. 31, no. 1, pp. 129–143, 2019.\n[17] Z. Wen, J. Cała, P . Watson, and A. 
Romanovsky, “Cost effective,\nreliable and secure workflow deployment over federated clouds,”\nIEEE Transactions on Services Computing, vol. 10, no. 6, pp. 929–941,\n2016.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 13]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n13\n[18] Z. Wen, R. Qasha, Z. Li, R. Ranjan, P . Watson, and A. Romanovsky,\n“Dynamically partitioning workflow over federated clouds for\noptimising the monetary cost and handling run-time failures,”\nIEEE Transactions on Cloud Computing, 2016.\n[19] G. Malewicz, M. H. Austern, A. J. Bik, J. C. Dehnert, I. Horn,\nN. Leiser, and G. Czajkowski, “Pregel: a system for large-scale\ngraph processing,” in Proceedings of the 2010 ACM SIGMOD Inter-\nnational Conference on Management of data, 2010, pp. 135–146.\n[20] M. Zaharia, M. Chowdhury, M. J. Franklin, S. Shenker, I. Stoica\net al., “Spark: Cluster computing with working sets.” HotCloud,\nvol. 10, no. 10-10, p. 95, 2010.\n[21] M. Abadi, P . Barham, J. Chen, Z. Chen, A. Davis, J. Dean, M. Devin,\nS. Ghemawat, G. Irving, M. Isard et al., “Tensorflow: A system for\nlarge-scale machine learning,” in 12thfUSENIXg symposium on\noperating systems design and implementation (fOSDIg 16), 2016, pp.\n265–283.\n[22] M. Isard, V . Prabhakaran, J. Currey, U. Wieder, K. Talwar, and\nA. Goldberg, “Quincy: fair scheduling for distributed computing\nclusters,” in Proceedings of the ACM SIGOPS 22nd symposium on\nOperating systems principles, 2009, pp. 
261–276.\n[23] N. J. Yadwadkar and W. Choi, “Proactive straggler avoidance\nusing machine learning,” White paper, University of Berkeley, 2012.\n[24] A. Badita, P . Parag, and V . Aggarwal, “Optimal server selection\nfor straggler mitigation,” IEEE/ACM Transactions on Networking ,\nvol. 28, no. 2, pp. 709–721, 2020.\n[25] J. Dean and L. A. Barroso, “The tail at scale,” Communications of the\nACM, vol. 56, no. 2, pp. 74–80, 2013.\n[26] K. Ousterhout, R. Rasti, S. Ratnasamy, S. Shenker, and B.-G. Chun,\n“Making sense of performance in data analytics frameworks,”\nin12thfUSENIXg Symposium on Networked Systems Design and\nImplementation (fNSDIg 15), 2015, pp. 293–307.\n[27] P . Garraghan, X. Ouyang, R. Yang, D. McKee, and J. Xu, “Straggler\nroot-cause and impact analysis for massive-scale virtualized cloud\ndatacenters,” IEEE Transactions on Services Computing, vol. 12, no. 1,\npp. 91–104, 2016.\n[28] X. Ouyang, P . Garraghan, R. Yang, P . Townend, and J. Xu, “Re-\nducing late-timing failure at scale: Straggler root-cause analysis in\ncloud datacenters,” in Fast Abstracts in the 46th Annual IEEE/IFIP\nInternational Conference on Dependable Systems and Networks. DSN,\n2016.\n[29] H. Zhou, Y. Li, H. Yang, J. Jia, and W. Li, “Bigroots: An effective\napproach for root-cause analysis of stragglers in big data system,”\nIEEE Access, vol. 6, pp. 41 966–41 977, 2018.\n[30] S. Lu, X. Wei, B. Rao, B. Tak, L. Wang, and L. Wang, “Ladra:\nLog-based abnormal task detection and root-cause analysis in big\ndata processing with spark,” Future Generation Computer Systems,\nvol. 95, pp. 392–403, 2019.\n[31] Z. He, Y. He, F. Liu, and Y. Zhao, “Big data-oriented product infant\nfailure intelligent root cause identification using associated tree\nand fuzzy dea,” IEEE Access, vol. 7, pp. 34 687–34 698, 2019.\n[32] H. Du and S. Zhang, “Hawkeye: Adaptive straggler identification\non heterogeneous spark cluster with reinforcement learning,”\nIEEE Access, vol. 8, pp. 
57 822–57 832, 2020.\n[33] J. P . Magalh ˜aes and L. M. Silva, “Root-cause analysis of perfor-\nmance anomalies in web-based applications,” in Proceedings of the\n2011 ACM Symposium on Applied Computing, 2011, pp. 209–216.\n[34] R. Bitar, M. Wootters, and S. El Rouayheb, “Stochastic gradient\ncoding for straggler mitigation in distributed learning,” IEEE\nJournal on Selected Areas in Information Theory, vol. 1, no. 1, pp.\n277–291, 2020.\n[35] A. M. Chacko, J. S. Medicherla, and S. M. Kumar, “Anomaly\ndetection in mapreduce using transformation provenance,” in\nAdvances in Big Data and Cloud Computing. Springer, 2018, pp.\n91–99.\n[36] N. Khoussainova, M. Balazinska, and D. Suciu, “Perfx-\nplain: debugging mapreduce job performance,” arXiv preprint\narXiv:1203.6400, 2012.\n[37] M. Du, F. Li, G. Zheng, and V . Srikumar, “Deeplog: Anomaly\ndetection and diagnosis from system logs through deep learning,”\ninProceedings of the 2017 ACM SIGSAC Conference on Computer and\nCommunications Security, 2017, pp. 1285–1298.\n[38] A. Agelastos, B. Allan, J. Brandt, P . Cassella, J. Enos, J. Fullop,\nA. Gentile, S. Monk, N. Naksinehaboon, J. Ogden et al., “The\nlightweight distributed metric service: a scalable infrastructure\nfor continuous monitoring of large scale computing systems and\napplications,” in SC’14: Proceedings of the International Conferencefor High Performance Computing, Networking, Storage and Analysis.\nIEEE, 2014, pp. 154–165.\n[39] A. Netti, M. M ¨uller, C. Guillen, M. Ott, D. Tafani, G. Ozer, and\nM. Schulz, “Dcdb wintermute: Enabling online and holistic op-\nerational data analytics on hpc systems,” in Proceedings of the 29th\nInternational Symposium on High-Performance Parallel and Distributed\nComputing, 2020, pp. 101–112.\n[40] A. Borghesi, A. Bartolini, M. Lombardi, M. Milano, and L. Benini,\n“Anomaly detection using autoencoders in high performance\ncomputing systems,” in Proceedings of the AAAI Conference on\nArtificial Intelligence, vol. 
33, 2019, pp. 9428–9433.\n[41] Nagios. Accessed: 2020-07-15. [Online]. Available: https://www.\nnagios.org/\n[42] Ganglia. Accessed: 2020-07-15. [Online]. Available: http://ganglia.\ninfo/\n[43] Apache chukwa. Accessed: 2020-07-14. [Online]. Available:\nhttps://chukwa.apache.org/\n[44] Dmon. Accessed: 2020-07-12. [Online]. Available: https://github.\ncom/Open-Monitor/dmon\n[45] U. Demirbaga, A. Noor, Z. Wen, P . James, K. Mitra, and R. Ranjan,\n“Smartmonit: Real-time big data monitoring system,” in 2019 38th\nSymposium on Reliable Distributed Systems (SRDS). IEEE, 2019, pp.\n357–3572.\nUmit Demirbaga (Member, IEEE) is a PhD stu-\ndent in the School of Computing, Newcastle\nUniversity, UK. He received an MSc degree in\nComputer Science from Newcastle University,\nUK in 2017 and the BSc degree in Electronics\nand Computer Education from Marmara Univer-\nsity, Turkey in 2011. His research interests in-\nclude big data analytics, cloud computing and\ndistributed systems. He was awarded Outstand-\ning Performance Award with Best Team Project\nAward in his MSc in 2017.\nZhenyu Wen (Member, IEEE) received MSc and\nPhD degrees in Computer Science from New-\ncastle University, Newcastle upon Tyne, UK, in\n2011 and 2016, respectively. He is currently a\nPostdoc Researcher with the School of Com-\nputing, Newcastle University, UK. His current re-\nsearch interests include IoT, crowd sources, AI\nsystem, and cloud computing. For his contribu-\ntions to the area of scalable data management\nfor the Internet of Things. He was awarded the\nIEEE TCSC Award for Excellence in Scalable\nComputing (Early Career Researchers) in 2020.\nAyman Noor is a PhD student in Computer\nScience at Newcastle University, UK. His cur-\nrent research interests include cloud computing,\nmonitoring, and machine learning. 
He earned a\nMaster of Science in Computer and Information\nScience from Gannon University, PA, USA in\n2013 and a Bachelor in Computer Science from\nthe College of Computer Science and Engineer-\ning from Taibah University, Madinah, SA in 2006.\nKaran Mitra is an Assistant Professor at Lule ˚a\nUniversity of Technology, Sweden. He received\nhis Dual-badge PhD from Monash University,\nAustralia and Lule ˚a University of Technology in\n2013. His research interests include cloud and\nmobile cloud computing, performance bench-\nmarking of distributed systems, context-aware\ncomputing and QoE. He is a member of the IEEE\nand ACM.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.\n\n[Página 14]\n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n14\nKhaled Alwasel has a BS and MS in informa-\ntion technology from Indiana University-Purdue\nUniversity Indianapolis (2014) and Florida Inter-\nnational University (2015), USA. He is currently\nworking toward a PhD in the School of Com-\nputing Science at Newcastle University (UK).\nKhaled’s interests lie in the areas of software-\ndefined networking (SDN), big data, IoT, edge\ncomputing, and cloud computing\nSaurabh Garg is a lecturer at the University of\nTasmania, Hobart, Tasmania. He has published\nmore than 30 papers in highly cited journals\nand conferences with H-index 24. He has gained\nabout three years of experience in industrial re-\nsearch while working at IBM Research Australia\nand India. 
His areas of interest are distributed\ncomputing, cloud computing, HPC, IoT, big data\nanalytics, and education analytics.\nAlbert Y. Zomaya is currently the Chair Pro-\nfessor of High Performance Computing & Net-\nworking in the School of Computer Science,\nUniversity of Sydney. He is also the Director of\nthe Centre for Distributed and High Performance\nComputing which was established in late 2009.\nProfessor Zomaya was an Australian Research\nCouncil Professorial Fellow during 2010-2014\nand held the CISCO Systems Chair Professor\nof Internetworking during the period 2002–2007\nand also was Head of School for 2006–2007.\nRajiv Ranjan is a Full professor in Comput-\ning Science at Newcastle University, UK. Before\nmoving to Newcastle University, he was Julius\nFellow (2013-2015), Senior Research Scientist\nand Project Leader in the Digital Productivity and\nServices Flagship of Commonwealth Scientific\nand Industrial Research Organization (CSIRO\nC Australian Government’s Premier Research\nAgency). Prior to that he was a Senior Research\nAssociate (Lecturer level B) in the School of\nComputer Science and Engineering, University\nof New South Wales (UNSW). Dr Ranjan has a PhD (2009) from\nthe department of Computer Science and Software Engineering, the\nUniversity of Melbourne.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply.",
+ "4eee3406-0542-45ea-afdc-870a7ac4dd41": {
+ "content": "0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n1\nAutoDiagn: An Automated Real-time Diagnosis\nFramework for Big Data Systems\nUmit Demirbaga, Zhenyu Wen\u0003Member, IEEE , Ayman Noor, Karan Mitra, Member, IEEE , Khaled\nAlwasel, Saurabh Garg, Albert Zomaya, Fellow, IEEE , Rajiv Ranjan, Senior Member, IEEE\nAbstract—Big data processing systems, such as Hadoop and Spark, usually work in large-scale, highly-concurrent, and multi-tenant\nenvironments that can easily cause hardware and software malfunctions or failures, thereby leading to performance degradation.\nSeveral systems and methods exist to detect big data processing systems’ performance degradation, perform root-cause analysis, and\neven overcome the issues causing such degradation. However, these solutions focus on specific problems such as stragglers and\ninefficient resource utilization. There is a lack of a generic and extensible framework to support the real-time diagnosis of big data\nsystems. In this paper, we propose, develop and validate AutoDiagn. This generic and flexible framework provides holistic monitoring of\na big data system while detecting performance degradation and enabling root-cause analysis. We present an implementation and\nevaluation of AutoDiagn that interacts with a Hadoop cluster deployed on a public cloud and tested with real-world benchmark\napplications. 
Experimental results show that AutoDiagn can offer a high accuracy root-cause analysis framework, at the same time as\noffering a small resource footprint, high throughput and low latency.\nIndex Terms—Root-cause analysis, Big data systems, QoS, Hadoop, Performance\nF\n1 I NTRODUCTION\nThe rapid surge of data generated through sectors like\nsocial media, financial services and industries has led to\nthe emergence of big data systems. Big data systems enable\nthe processing of massive amounts of data in relatively\nshort time frames. For instance, the Netflix big data pipeline\nprocesses approximately 500 billion events and 1.3 petabytes\n(PB) of data per day, further, during peak hours, it processes\napproximately 11 million events and 24 gigabytes (GB) of\ndata on a per-second basis. Facebook has one of the largest\ndata warehouses in the world, capable of executing more\nthan 30,000 queries over 300 PB data every day. However,\nthe enormousness and complexity of the big data system\nruns in heterogeneous computing resources, multiple tenant\nenvironments, as well as has many concurrent execution of\nbig data processing tasks, which makes it a challenge to\nutilize the big data systems efficiently and reliably[1]. For\nexample, Fig. 1 shows that the performance degrades at\nleast 10% when the resources are not utilized efficiently with\nSetting 2.\n\u000fU. Demirbaga is with Newcastle University, United Kingdom and Bartin\nUniversity, Turkey. E-mail: u.demirbaga2@newcastle.ac.uk\n\u000fZ. Wen is with Newcastle University, United Kingdom. E-mail:\nzhenyu.wen@newcastle.ac.uk, corresponding author.\n\u000fA. Noor is with Newcastle University, United Kingdom and Taibah\nUniversity, Saudi Arabia. E-mail: anoor@taibahu.edu.sa\n\u000fK. Mitra is with Lule˚ a University of Technology, Sweden. E-mail:\nkaran.mitra@ltu.se\n\u000fK. Alwasel is with Newcastle University, United Kingdom and Saudi\nElectronic University, Saudi Arabia. E-mail: kalwasel@gmail.com\n\u000fS. 
Garg is with University of Tasmania, Australia. E-\nmail:Saurabh.Garg@utas.edu.au\n\u000fA. Zomaya is with Sydney University, Australia, E-mail: al-\nbert.zomaya@sydney.edu.au\n\u000fR. Ranjan is with Newcastle University, United Kingdom. E-mail:\nraj.ranjan@newcastle.ac.uk\n 0 50 100 150 200 250 300 350\nWordCountGrepTPC-HTPC-DS K-means PageRankMakespan (sec)\nBig data applicationsSetting 1 Setting 2Fig. 1. Six big data applications are executed in a cloud-based Hadoop\ncluster with two settings: 1) the input data and jobs are allocated in\nthe same node; 2) the input data and jobs are allocated in different\nnodes. In Setting 2, the execution time of each application is delayed\nby transmitting data across nodes.\nTo overcome this, it is imperative to continuously mon-\nitor and analyze all available system resources at all times\nin a systematic, holistic and automated manner. These re-\nsources include CPU, memory, network, I/O and the big\ndata processing software components.\nMost of the commercial [2][3][4] and academic big\ndata monitoring systems mainly focus on visualizing task\nprogress, and the system’s resource utilization [5]. How-\never, they do not focus on the interaction between multiple\nfactors and performing root-cause analysis for performance\ndegradation [6][7]. Moreover, works such as [8], [9] aim to\nfind the best parameters to optimize the performance of\nManuscript received ???; revised ???\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n2\nbig data processing systems, they do not focus on the root-\ncause analysis that may indicate the viable reasons behind\nperformance degradation and may provide intuitions for\nparameter tweaking.\nMantri [10] presents a systematic method that catego-\nrizes the main reasons causing outliers in a big data system.\nThe authors’ work was focused on the MapReduce pro-\ngramming framework in the Hadoop system; they do not\ndiscuss how Mantri can be applied to other big processing\nframeworks (e.g., Apache Spark1, and Apache Flink2). Gar-\nraghan et al. [11] proposed an online solution to detect long-\ntail issues in a distributed system. However, these solutions\nwere built for specific scenarios with much scope left for\nanalyzing a variety of problems that can exist in a large\nscale big data processing system.\nTo the best of our knowledge, there is a lack of a generic\nand comprehensive solution for the detection of a wide\nrange of anomalies and performance of root-cause analysis\nin big data systems. Developing a general and extensible\nframework for diagnosing a big data system is not trivial.\nIt requires well-defined requirements which could enable\nthe broader adoption of root-cause analysis for the big\ndata systems, flexible APIs to interact with an underlying\nmonitoring system and integration of multiple solutions for\ndetecting performance reduction problems while enabling\nthe automatic root-cause analysis. In this paper, we tackle\nthis research gap, and design and develop AutoDiagn to au-\ntomatically detect performance degradation and inefficient\nresource utilization problems, while providing an online\ndetection and semi-online root-cause analysis for a big data\nsystem. 
Further, it is designed as a microservice architecture\nthat offers the flexibility to plug a new detection and root-cause\nanalysis module for various types of big data systems.\nThe contributions of this paper are as follows:\n\u000fAn online and generic framework: We develop a general\nframework called AutoDiagn which can be adapted for\nthe detection of a wide range of performance degrada-\ntion problems while pinpointing their root-causes in big\ndata systems.\n\u000fA case study: We develop a novel real-time stream pro-\ncessing method to detect symptoms regarding outliers\nin a big data system. After that, we develop a set of\nquery APIs to analyze the reasons that cause the outlier\nregarding a task.\n\u000fA comprehensive evaluation: We evaluate the feasibility,\nscalability and accuracy of AutoDiagn through a set of\nreal-world benchmarks over a real-world cloud cluster.\nThe paper is organized as follows. The design require-\nments and idea are outlined in §2. In §3, we illustrate the\nhigh-level system architecture. §4 presents a case study that\nwe implemented and the case study is evaluated in §5. §6\ndiscusses the limitations of this paper and highlights our\nfurther work . Before drawing a conclusion in §8, we discuss\nthe related work in §7.\n1. https://spark.apache.org/\n2. https://flink.apache.org/2 R EQUIREMENTS AND DESIGN IDEA\nIn this section, we analyze the key requirements of the\nreal-time big data diagnosis system, extracting the essential\nfeatures from the literature. 
Next, we present the key idea\nof the framework design.\n2.1 Fundamental prerequisite for diagnosing big data\nprocessing systems\nIn order to design a generic framework for diagnosing big\ndata processing systems, we classified the fundamental re-\nquirements of building a diagnosis system on such systems\nas follows:\n\u000fInfrastructure monitoring: Collecting the information\nabout the underlying system, such as network condi-\ntions, CPU utilization, memory utilization, and disk\nI/O status.\n\u000fTask execution monitoring: Collecting the task infor-\nmation, including execution time, progress, location,\nlocation of its input data, input data size, output data\nsize, CPU/memory usage, and process state (running,\nwaiting, succeeded, failed, killed).\n\u000fAbnormal behavior or fault detection: Detecting ab-\nnormal behaviors in big data processing systems, such\nas slowing tasks, failed tasks, very high/low resource\nusage, and experiencing very high response time for the\nrequests.\n\u000fRoot-cause analysis: Finding the root cause of perfor-\nmance reduction in big data processing systems, such\nas the reasons why: tasks are slowing down, resource\nutilization is low, the response time is high, or when the\nnetwork latency is high.\n\u000fVisualization: Visualizing the collected metrics and\nthe results of root-cause analysis of any failures caus-\ning performance reduction in the cluster with a user-\nfriendly interface in real-time.\n2.2 Key design idea\nMotivated by the above-mentioned requirements and in-\nspired by medical diagnosis, we highlight the design idea\nof root-cause analysis for big data processing systems as\nshown Fig. 2, which aims to provide holistic monitoring\nand root cause analysis for big data processing systems.\nFirst, a set of Symptom Detectors is defined and developed in\nSymptom Detection to detect the abnormalities of the big\nsystem by processing collected system information stream\nin real-time. 
Once a symptom (abnormality) is detected,\ntheDiagnosis Management may launch the corresponding\nDiagnosers to troubleshoot the cause of the symptom. One\nsymptom may correspond to root causes. Finally, the deci-\nsions are made based on the root-cause analysis results.\n2.3 The generalizability of AutoDiagn\nModern big data processing systems consists of two main\ntypes: Big data analytics (e.g., Hadoop, Spark) and Stream\nprocessing (e.g., Flink, Spark Stream). Based on our de-\nsign idea, our AutoDiagn is an independent framework\nthat can be deployed alongside existing big data cluster\nmanagement systems (e.g., Apache YARN), and ideally it\nis suitable for root-cause analysis of any big data processing\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n3\nsystem. However, for the scope of this paper and practi-\ncal certainty, the implementation of AutoDiagn focuses on\ndebugging root causes of performance degradation (e.g.,\nslow task execution time) in Hadoop due to faults such as\ndata locality, cluster hardware heterogeneity, and network\nproblems (e.g., disconnection). 
Although we have validated\nthe functionality of AutoDiagn in the context of Hadoop and\nconsidering different classes of workload (e.g., WordCount,\nGrep, TPC-H, TPC-DC, K-means clustering, PageRank), it is\ngeneralizable to other big data processing systems executing\nsimilar classes of workload.\n3 A UTODIAGN ARCHITECTURE\nFollowing the design idea laid out in §2, we introduce Auto-\nDiagn, a novel big data diagnosing system. We first illustrate\nthe high-level system architecture and then describe the\ndetails of each component. AutoDiagn is implemented in\nJava and all source code is open-source on GitHub3.\n3.1 Architecture overview\nAutoDiagn provides a systematic solution that automati-\ncally monitors the performance of big data systems while\ntroubleshooting the issues that cause performance reduc-\ntion. Fig. 3 shows its two main components: AutoDiagn\nMonitoring and AutoDiagn Diagnosing. AutoDiagn Monitoring\ncollects the defined metrics (logs) and feeds AutoDiagn Diag-\nnosing with them in real-time. Once the abnormal symptoms\nare detected by analyzing the collected metrics, a deeper\nanalysis is conducted to troubleshoot the cause of abnormal\nsymptoms.\nAutoDiagn Monitoring. AutoDiagn Monitoring is a de-\ncentralized real-time stream processing system that collects\ncomprehensive system information from the big data system\n(e.g., Hadoop Cluster). The Collected Metrics is a set of\npre-defined monitoring entities (e.g., CPU usage, memory\nusage, task location, task status) used to detect the abnormal\nsymptoms. Moreover, the system information, required for\nunderstanding the cause of detected abnormal symptoms,\nis collected in this modular.\nAutoDiagn Diagnosing. AutoDiagn Diagnosing is an event\nbased diagnosing system. First, the carefully crafted metrics\nare injected into the Symptom Detection Engine which is a\nreal-time stream processing module to detect the abnormal\nsymptoms in a big data system. 
In this paper, we use\nthe outlier which is a common symptom for performance\nreduction in a Hadoop cluster as a case study to demon-\nstrate the proposed framework. §4.1 illustrates the details\nof technology that we developed for symptom detection.\nMoreover, our system follows the principle of modular\nprogramming; the new symptom detection method can be\neasily plugged in. Diagnoser Plugins is a component for\ntrouble-shooting the reasons behind the detected symptom.\nA set of Diagnosers is instantiated by the Diagnoser Manager\nwhen their corresponding symptoms are detected. Then\nthe instantiated Diagnosers query a time series database to\nobtain the required input and their outputs illustrate the\ncause of the detected symptoms.\n3. https://github.com/umitdemirbaga/AutoDiagn3.2 AutoDiagn monitoring framework\nAutoDiagn monitoring framework is a holistic solution for\ncontinuous information collection in a big data cluster.\nThe framework needs to have a fast, flexible and dynamic\npipeline to transfer the collected data as well as a high per-\nformance, large scale storage system. We now describe an\nimplementation of the framework for a big data computer\ncluster, and the high-level system architecture is shown in\nFig. 4.\nInformation Collection. In each compute node, we develop\nand deploy an Agent to collect real-time system information.\nFor the worker node, the Agent collects the usage of com-\nputing resource via SIGAR APIs4, including CPU, memory,\nnetwork bandwidth, and disk read/write speeds. Moreover,\ntheAgent in the master node collects the usage of computing\nresource as well as the job and tasks information. The Filter\nis developed by using GSon Library5to remove the less im-\nportant information obtained from ResourceManager REST\nAPI’s6, thereby reducing the size of data transmission. 
The\ncollected information is sent to RabbitMQ7cluster which is\na lightweight and easy-to-deploy messaging system in each\ntime interval via Publisher.\nStorage. The acquired information is time series data, we\ntherefore choose InfluxDB8for data storage. InfluxDB is a\nhigh performance, scalable and open source time series data\nbase which provides a set of flexible open APIs for real-time\nanalytics. The Consumer subscribes the related stream topics\nfrom RabbitMQ and interacts with InfluxDB APIs to inject\nthe information to the data base.\nInteracting with AutoDiagn Diagnosing. The information\nrequired for symptom detection is directly forwarded and\nprocessed in AutoDiagn diagnosing via a consumer. If a\nsymptom is detected, InfluxDB will be queried by AutoDi-\nagn diagnosing for root-cause analysis. Finally, the analysis\nresults are sent back to the database to be stored.\nUser visualization. The user visualization allows the users\nto have a visible way to monitor their big data system. We\nutilize InfluxDB’s client libraries and develop a set of REST-\nful APIs to allow the users to query various information,\nincluding resource utilization, job and task status, as well as\nroot cause of performance reduction.\n3.3 AutoDiagn diagnosing framework\nIn this section, we discuss the core components of the\nAutoDiagn Diagnosing framework (see Fig. 3), as well as the\ninteractions with each other and the AutoDiagn Monitoring\nframework.\nSymptom Detection Engine. The symptom detection en-\ngine subscribes a set of metrics from the real-time streaming\nsystem. §4.1 illustrates the technique that we developed\nfor outlier detection. This component follows microservices\narchitecture to which new symptom detection techniques\ncan be directly attached to our AutoDiagn, interacting with\nother existing techniques to detect new symptoms.\n4. https://github.com/hyperic/sigar\n5. https://github.com/google/gson\n6. https://hadoop.apache.org/docs/r3.2.1/hadoop-yarn\n7. 
https://www.rabbitmq.com/\n8. https://www.influxdata.com/\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n4\nSymptom Detection Diagnosis Management Decision MakingSymptoms \n(N)\nRoot -cause 1\n•\n•\n••\n•\n•Root -cause 2\nRoot -cause M•\n•\n•Root -cause 3Diagnosis \n(M)\nRoot -cause 4MetricsSymptom \nDetector 2\nSymptom \nDetector NSymptom \nDetector 1Diagnoser 1\nDiagnoser 2\nDiagnoser 3\nDiagnoser 4\nDiagnoser MDecision 1\nDecision 2\n•\n•\n•\nDecision N Root -cause M -1 Diagnoser M -1\nFig. 2. The key design idea of root-cause analysis for big data processing systems\nAutoDiagn Diagnosing\nDiagnoser Plugins\nDiagnoser 1\nTask\nInput\nOutput\n…\nDiagnoser N\nTask\nInput\nOutputAutoDiagn Monitoring\nSymptom\nDetection\nEngine\nDiagnosis \ndecisionsCollected \nmetricsDetected\nSymptoms\nRoot -causes of the symptoms \nDiagnoser \nManager\nFig. 3. The high-level architecture of the AutoDiagn system\nDiagnoser Manager. The diagnoser manager is the core\nentity responsible for selecting the right diagnosers to find\nthe reasons that cause the detected symptoms. Additionally,\nthe diagnoser manager is developed as a front-end com-\nponent, triggered by various detected symptoms (events)\nvia a RESTful API, exposing all diagnosing actions within\nour framework. The API includes general actions such as\nstarting, stopping or loading a diagnoser dynamically, and\nspecific actions such as retrieving some metrics. 
Importantly,\nthe diagnoser manager is able to compose a set of diagnosers\nto complete the diagnosing jobs that may require the coop-\neration of different diagnosers.\nDiagnoser Plugins. The diagnoser plugin contains a set of\ndiagnosers; and a diagnoser is the implementation of the\nspecific logic to perform root-cause analysis of a symptom.\nEach diagnoser refers to a set of metrics stored in a time\nseries database as the input of its analysis logic. Whenever\nit is activated by the diagnoser manager, it will perform\nan analysis, querying the respective metrics, executing the\nanalytic algorithm, and storing the results. §4.2 discusses the\nalgorithms to detect the outlier problems, for example, in aHadoop cluster. The diagnoser plugin is also designed as\na microservice architecture which has two advantages: i) a\nnew diagnoser can be conveniently plugged or unplugged\non-the-fly without affecting other components; ii) new root-\ncause analysis tasks can be composed by a set of diagnosers\nvia RESTful APIs.\n3.4 AutoDiagn diagnosing interfaces for Hadoop\nAutoDiagn exposes a set of simple interfaces for system\nmonitoring, symptom detection and root-cause analysis.\nTable 1 shows that two types of APIs are defined: high-\nlevel APIs and low-level APIs. The high-level APIs consist\nofSymptom Detection, Diagnoser and Decision Making.\nThe Symptom Detection APIs are a set of real-time stream\nprocessing functions used to detect the defined symptoms\ncausing the performance reduction in the Hadoop system.\nEach Diagnoser is a query or a set of queries, which aim\nto find one of the causes of a symptom. For example,\nQueryNonLocal() tries to find all non-local tasks within a\ntime interval, which is one of the reasons that causes an out-\nlier. 
Finally, the Decision Making APIs are used to analyze\nthe results from each Diagnoser and make the conclusion.\nThese high-level APIs have to interact with the low-level\nAPIs (Information Collection) to obtain system information\nincluding resource usage, and the execution information of\nthe big data system (e.g., ask and job status in a Hadoop\nsystem). Based on this flexible design, users can define\nand develop their own Symptom Detection, Diagnoser and\nDecision Making APIs and plug them into AutoDiagn.\n3.5 Example applications\nWe now discuss several examples for big data system root\ncause applications using AutoDiagn API.\nOutliers. Outliers are the tasks that take longer to finish\nthan other similar tasks, which may prevent the subse-\nquent tasks from making progress. To detect these tasks,\nthe real-time stream query QueryOutlier() is enabled\nin the Symptom Detection Engine. This function consumes\neach task’s completion rate (i.e., progress) and the executed\ntime to identify the outlier tasks (detailed in §4.1). Next,\nthree APIs QueryNonlocal(), QueryLessResource()\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n5\nComputer Cluster\nMaster Node\nPublisher FilterCollector AgentResource\nInformationTask\nInformation\n…Message \nBrokerAutoDiagn \nDiagnosingManagement Node\nUser \nVisualization\n StorageConsumer\nConsumer\nWorker Node 1Publisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nWorker Node 2Publisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nWorker Node NPublisher\nCollectorAgent\nResource\nInformation\n…Task 1 Task N\nFig. 4. The high-level architecture of the monitoring framework\nandQueryNodeHealth(), corresponding to three Diag-\nnosers that are used to analyze the reasons causing the de-\ntected symptom, are executed. QueryNonlocal() queries\nwhether the input data is allocated on the node on which\nan outlier task is processed. In addition, QueryLessRe-\nsource() investigates whether outlier tasks are running\non the nodes that have less available resource. Moreover,\nQueryNodeHealth() examines if an outlier task is the\ntask that is a restarted task due to the disconnected nodes\nfrom the network. Finally, RootcauseOutlier() is used\nto process the results from the three Diagnosers and make\nthe conclusion. All the APIs are shown in Table 1 and the\ntechnical details are illustrated in §4.\nInefficient resource utilization. In our case this means that\nsome tasks are pending (or waiting) to be on worker nodes;\nat the same time, some worker nodes are idle, e.g., low CPU\nand memory usage. There are many reasons that cause this\nissue, but here we consider two key causes: task heterogeneity\nand resource heterogeneity. The type of tasks in a big data sys-\ntem are various, including CPU intensive tasks, IO intensive\ntasks and memory intensive tasks. 
However, the underlying\ncomputing resources are typically equally distributed to\nthese tasks, thereby causing inefficient resource utilization.\nThe latter is caused by the heterogeneous underlying com-\nputing resources due to the multiple concurrent processing\ntask environments and the queues are built on the saturated\nnodes.\nTo detect the inefficient resource utilization in a big data\nsystem, the real-time stream query QueryResourceU-\ntil() is used within a defined time interval. We com-\npute the mean and standard deviation of the usage re-\nsources of the whole cluster. If the standard deviation\nis far from the mean, we will further query whether\nthe tasks are queued on the nodes which have high\nresource usage rates. If inefficient resource utilization\nis detected, two Diagnosers, QueryOversubscribed()\nand QueryDiskIOboundTasks(), which are the root-\ncause analysis APIs shown in Table 1, are executed toperform root-cause analysis. QueryOversubscribed()\nchecks the type of tasks queuing on the saturated nodes.\nTheQueryDiskIOboundTasks() checks whether the sat-\nurated nodes have less available computing resource,\nwhile processing the allocated tasks. The conclusion of the\ncause of inefficient resource utilization is made in Root-\ncauseResInef().\n3.6 Parallel execution\nFollowing the key design idea, the diagnosers are triggered\nby the corresponding detected symptom. However, we are\nable to parallelize the execution of each symptom detector\nand its diagnosers by partitioning the input data. For ex-\nample, if one symptom detector needs to process too many\ndata streams, we can use two of the same instances of the\nsymptom detector to process the data streams and aggregate\nthe results from two symptom detectors. 
The diagnoser can\nfollow the same strategy for parallel execution.\n3.7 Reliability analysis\nAutoDiagn follows the centralized design for data collec-\ntion, which simplifies the implementation of the Symptom\nDetection, Diagnosis Management and Decision Making. They\ncan easily obtain the required information from one place,\ninstead of interacting with the entire big data system. More-\nover, the centralized design does not mean unreliability, due\nto the high-availability of RabbitMQ. The RabbitMQ cluster\ncan overcome the node fail in the message queuing system\nwhile ensuring scalability.\n4 C ASESTUDY\nIn the previous section, we have discussed that our frame-\nwork supports detection of multiple types of symptoms\n(e.g., outliers, inefficient resource utilization). However, de-\ntecting these symptoms is non-trivial; and each symptom\ncan be detected by using different algorithms with different\ninput metrics. In this section, we present a case study that\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n6\nTABLE 1\nAutoDiagn diagnosing interface. 
See §3.4 for definitions and examples\nSymptom Detection (High-level APIs) Description\nQueryOutlier() Execute a Query that returns the list of outliers if any.\nQueryResourceUtil() Execute a Query that returns the list of the worker nodes in which the computing resources are not uti-\nlized effectively if any.\nDiagnoser (High-level APIs) Description\nQueryNonLocal() Execute a Query that return the list of non-local tasks if any.\nQueryLessResource() Execute a Query that returns false if the cluster is not homogeneous in terms of having resource capacity (CPU/memory).\nQueryNodeHealth() Execute a Query that returns the list of disconnected worker nodes in the cluster if any.\nQueryOversubscribed() Execute a Query that returns the list of the oversubscribed tasks if any.\nQueryDiskIOboundTasks() Execute a Query that returns the list of the disk- or IO-bound tasks if any.\nDecision Making (High-level APIs) Description\nRootcauseOutlier() Execute a Query that illustrate the main reason of the cause of the outlier.\nRootcauseResInef() Execute a Query that illustrate the main reason of the cause of inefficient resource utilization.\nInformation Collection (Low-level APIs) Description\ntaskExecTime() Return the execution time since the task started in sec.\ntaskProgress() Return the progress of the running task as a percentage.\ntaskInput() Return the input data size of the running task in mb.\ntaskBlock() Return the block id this task process.\ntaskHost() Return the name of the node thistask ran on.\ntaskCPUusage() Return the CPU usage of the task.\ntaskMemoryUsage() Return the memory usage of the task.\ntaskContainerCPU() Return the allocated CPU to the container this task ran on.\ntaskContainerMemory() Return the allocated memory to the container this task ran on.\nblockHost() Return the names of the nodes that host the block.\npendingTasks() Return the number of the tasks waiting to be run.\nnodeTotalCoreNum() Return the number of the CPU core number of the 
node.\nnodeCPUUsage() Return the CPU utilization of the node.\nnodeTotalMem() Return the total memory capacity of the node.\nrestartedTasks() Return the name of the restarted tasks due to nodes that got disconnected from the network.\nnodeMemUsage() Return the memory utilization of the node.\nnodeDiskReadSpeed() Return the disk read speed of the node.\nnodeDiskWriteSpeed() Return the disk write speed of the node.\nnodeUploadSpeed() Return the network upload speed of the node.\nnodeDownloadSpeed() Return the network download speed of the node.\ndetails the technology of detecting outliers and the root-\ncauses analysis for the detected outliers. The notations used\nin this paper are summarized in Table 2.\nTABLE 2\nA summary of symbols used in the paper\nSymbols Description\nJp Job progress\nN Name of the task\nNl List ofN\nP Performance of the N\nPl List ofP\nO Progress of theN\nOl List ofO\nT Execution time of the N\nTl List ofT\nmed The performance of median task\nD Non-local tasks\nDl List of Non-local task\nR Task running on the node with less resources\nRl List ofR\nW Restarted tasks due to the nodes’ network failure\nWl List ofW\nSl List of outlier task\nSd Non-local outlier\nSdl List of Sd\nSr Outlier stemming from the resource variation\nSrl List of Sr\nSw Outlier stemming from disconnected nodes\nSwl List of Sw\nF Factor value of 1.5 used to find the S4.1 Symptom detection for outliers\nAnanthanarayanan et al. [10] defined the outlier tasks’ run-\ntime to be 1.5 times higher than that of the median task\nexecution time; their method is based on the assumption\nthat all tasks are started at the same time and are the same\ntype (i.e., the same input data and the same processing\ncode), which is not suitable for real-time symptom detection,\nbecause in a time interval the tasks may be submitted at\ndifferent times; the input data size of the tasks and the code\nfor tasks are not always the same. 
In this paper, we use\nPerformance (P) to measure the outlier as shown in Eq 1. O\nrepresents the normalized value of the task progress in terms\nof percent work complete, and Tis the normalized value of\nthe task execution time.\nP=O\nT(1)\nEq 2 is used to normalize the OandT, where xmin and\nxmax are the minimal and maximal values of the given\nmetrics (eg., task progress and execution time) in a time\ninterval. We set b= 1 anda= 0:1 to restrict the normalized\nvalues within the range from 0.1 to 1 [12].\nxnorm =a+(x\u0000xmin)(b\u0000a)\nxmax\u0000xmin(2)\nMoreover, we define the outlier tasks which have 1.5\ntimes less performance value than the median performance\nvalue in each time interval. Fig. 5 shows a snapshot of a time\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n7\nAlgorithm 1: Automated symptom detection for\noutliers\nInput: Jp- job progress in percentage,\nF- factor,\nN- name of the running task,\nNl- list ofN,\nO- progress of the task,\nOl- list ofO,\nT- execution time of the task,\nTl- list ofT.\nOutput: Sl- list of outliersS.\n1// Create a list Slto store theS\n2Sl Sl[0]\n3// Initialize the med\n4med med[0]\n5while Jp<100.0 do\n6 //Clear the SlandPl\n7 Sl Clear (Snew\nl ,Sl)\n8 Pl Clear (Pnew\nl ,Pl)\n9 foreachNinNldo\n10 //ComputeP\n11P=O\nT\n12 //Insert thePinto the Pl\n13 Pl.add(P )\n14 end\n15 //Get themedfrom thePl\n16 med Median value of Pl\n17 foreach value of Pldo\n18 if(P*F)< m edthen\n19 //Insert theNinto theSl\n20 Sl.add(N )\n21 end\n22 end\n23 //Update the SlinDiagnosis Generation component\n24 Sl Update (Snew\nl ,Sl)\n25 //Update the Nl,Ol,Tl,Jp\n26 Nl Replace (Nnew\nl ,Nl)\n27 Ol Replace (Onew\nl ,Ol)\n28 Tl Replace (Tnew\nl ,Tl)\n29 Jp Replace (Jnew\np ,Jp)\n30end\ninterval (e.g., three seconds), and two mappers are identified\nas outliers. More evaluations will be discussed in §5.\nAlgorithm 1 demonstrates the proposed ASD (auto-\nmated symptom detection) algorithm in the AutoDiagn\nsystem. It is fed by the streaming data provided by the\nAutoDiagn Monitoring system during job execution. First,\nthe performance of each running task is calculated (see\nAlgorithm 1, Line 11) using Eq 1. Next, the median value\nof the performance of all tasks is taken to be used to detect\noutliers (see Algorithm 1, Line 16). Then, the tasks whose\nperformance is 1.5 times less than the performance of the\nmedian task are selected as outliers (see Algorithm 1, Line\n20). 
As a final step, these tasks detected as outliers are sent to\ntheDiagnosis Generation component for root-cause analysis\n(see Algorithm 1, Line 24).\n4.2 Root cause analysis for outliers\nWhen the detected symptoms are passed to the Diagnoser\nManager, the corresponding Diagnosers are executed for\ntrouble-shooting. The following subsection illustrates the\ntechnologies that we have developed for analyzing the\ncauses of outliers in a Hadoop cluster.\n4.2.1 Root cause of outliers\nIn this paper, we follow the three main reasons that cause\noutliers, discussed in [10], i.e., Data locality, Resource het-\nerogeneity, and Network failures.\nProgress (%)Execution time (sec) 0 1 2OutliersMedian=1.11Performance levels \n 30 35 40 45 50 55 60 65 14 16 18 20 22 24 26 28 30 32\nPerformance 0.2 0.4 0.6 0.8 1 1.2 1.4\nFig. 5. Performance evaluation of the tasks\nData locality. Hadoop Distributed File System (HDFS)\nstores the data in a set of machines. If a task is scheduled to\na machine which does not store its input data, moving data\nover the network may introduce some overheads to cause\nthe outliers issue.\nResource heterogeneity. The machines in a Hadoop cluster\nmay be homogeneous with the same hardware configura-\ntion, but the run-time computing resources are very hetero-\ngeneous due to the multiple talents environment, multiple\nconcurrent processing task environment, machine failures,\nmachine overloaded etc. If a task is scheduled to a bad\nmachine (e.g., has less computing resource) it may cause\nan outlier issue. Moreover, resource management systems\nfor a large-scale cluster like YARN split the tasks over the\nnodes equally without considering the resource capacities of\nthe nodes in the cluster, but only takes into account sharing\nthe node’s resources among the tasks running on the node\nequally by default [13]. That is more likely to raise an outlier\nproblem in the cluster.\nNetwork failure. 
In Hadoop clusters, the network discon-\nnection can cause the running tasks allocated on a discon-\nnected node to be restarted on other nodes, which may lead\nto the task becoming an outlier and, increase the completion\ntime. The following illustrates the three algorithms that\nwe developed to identify the outliers caused by the three\nreasons.\n4.2.2 Detecting data locality issues\nWe assume that a non-local task (D ) (e.g., mapper) is ex-\necuted on a node where its input data is not stored (In the\nfollowing, we use Sdto represent non-local outliers). To detect\nthese tasks, we develop Algorithm 2 to check whether a set\nof outliers is caused by a data locality issue. The input of\nour algorithm is a list of detected outliers during the time\ninterval from ttot+ 1 and one of its outputs is a list of\noutliers which also belongs to the non-local tasks. First, we\nquery our time series database to obtain all non-local tasks\nwithin the given time interval (see Algorithm 2, Line 2).\nHere, QueryNonLocal(), a root-cause analysis API, is\nused to find the non-local ones among the running tasks\nin that period of time. It compares the location where the\ntask is running (host node of the task) with the nodes\nwhere the data block is replicated for fault tolerance via\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n8\ninformation collection APIs shown in Table 1, taskHost()\nandblockHost(). 
If the task is not running on any of\nthese nodes (nodes hosting a copy of the block), this task\nis marked as a non-local task. In the second step (Algorithm\n2, Line 4), we obtain the common elements of list DlandSl.\nThese elements symbolize the non-local outliers stemming\nfrom a data locality issue.\n4.2.3 Detecting resource heterogeneity issues\nAlgorithm 2 is designed to identify the outliers caused by\nthe resource heterogeneity. The tasks running on the nodes\nwhich have less computing resource (R ) tend to be outliers\n[14] (in the following, we use Srto represent outliers running\non the nodes which have less computing resource). In Algorithm\n2, the list of detected outliers during the time interval from\nttot+ 1 is used as input and one of the outputs of the\nalgorithm is a list of outliers which also belongs to the tasks\nrunning on the node with less computing resource. The time\nseries database is queried to obtain all the tasks running on\nthe node with less computing resource within the given time\ninterval (see Algorithm 2, Line 6).\nHere, QueryLessResource(), a root-cause analysis\nAPI, is used to check the heterogeneity of the nodes that host\nonly the running tasks based on the resource specifications\nof them in that period of time. It detects the nodes with less\nresource capacity in terms of CPU core numbers and the to-\ntal amount of memory among the nodes hosting the running\ntasks. The resource specifications of the nodes (i.e., CPU\ncore numbers, total amount of memory) are obtained from\neach node via information collection APIs shown in Table 1,\nnodeTotalCoreNum() andnodeTotalMem() APIs. As a\nsecond step (Algorithm 2, Line 8), we obtain the common\nelements of list RlandSl. 
These elements symbolize the\noutliers stemming from a cluster heterogeneity issue.\n4.2.4 Detecting network failure issues\nSince Slis obtained from Algorithm 1, a Diagnoser is exe-\ncuted via QueryNodeHealth() to find all restarted tasks\ndue to the nodes disconnected by network failure within the\ngiven time interval (see Algorithm 2, Line 10). The low-level\nAPIrestartedTasks() is called which distinguishes the\nrestarted tasks due to network failure from the speculation\nof straggler tasks by analyzing the information of the tasks\nthat is provided by the monitoring agent. Thereafter, we\ncompute the list Swlthat contains the outlier tasks caused\nby the network failure (see Algorithm 2, Line 12).\n4.2.5 Decision making\nIn this case study, we use a simple decision make method\nthat compares the lists Sdl,SrlandSwland the probability\nof the reasons causing the outliers by using the number\nof the elements of a list divided the total number of out-\nlier tasks. For instance, the probability of the performance\nreduction caused by data locality isjSdlj\njSlj. 
More advanced\nmethods such as deep learning models can be used for pro-\ncessing more complicated decision making tasks in future\nwork.Algorithm 2: Root-cause analysis of outliers\nInput: Sl- list of outliers in time interval from ttot+ 1\nOutput: Sdl- list of non-local outliers Sd,\nSrl- list of outliers stemming from resource variation Sr,\nSwl- list of outliers stemming from disconnected nodes Sw.\n1// Find allDwithin the given time interval\n2Dl QueryNonLocal(t, t+1)\n3//Find the common elements in the DlandSl, and add them\ninto theSdl\n4Sdl RetainAll (Dl,Sl)\n5// Find allRwithin the given time interval\n6Rl QueryLessResource(t, t+1)\n7//Find the common elements in the RlandSl, and add them\ninto theSll\n8Srl RetainAll (Rl,Sl)\n9// Find allWwithin the given time interval\n10Wl QueryNodeHealth(t, t+1)\n11//Find the common elements in the WlandSl, and add them\ninto theSwl\n12Swl RetainAll (Wl,Sl)\n5 E VALUATION\nIn this section, we present a comprehensive evaluation\nshowing the capacity and the accuracy rate of AutoDiagn,\nas well as a analysis of its resource consumption and over-\nheads.\n5.1 Experimental setup\nEnvironments. We set up the Hadoop YARN clusters over\n31 AWS nodes with 1 master and 30 slaves with the Oper-\nating system of each node being Ubuntu Server 18.04 LTS\n(HVM). The Hadoop version is 3.2.1 and the Hive version\nis 3.1.1. To meet our experimental requirements, we built\ntwo types of cluster. In Type I each node has the same\nconfiguration (i.e., 4 cores and 16 GB memory). In Type II,\n25 nodes have 4 cores and 16 GB memory and 6 nodes have\n2 cores and 4 GB memory.\nBenchmarks and workload. We used six well-known\nHadoop benchmarks in our evaluations namely: Word-\nCount9, Grep10, TPC-H11, TPC-DS12, K-means clustering13,\nand PageRank14. The input of each benchmark application\nis 30GB.\nMethodology. Our experiments aim to evaluate the effec-\ntiveness of AutoDiagn. 
To this end, we manually inject the\nabove-mentioned three main reasons to cause the outliers,\nwhich can be summarized as three types of execution en-\nvironment. EnvA: we perform all benchmark experiments\nin the cluster Type I. EnvB: we perform all benchmark\nexperiments in the cluster Type I, but skew the input size\nstored on different nodes. EnvC: we perform all benchmark\nexperiments in the cluster Type II (a heterogeneous cluster).\nEnvH: we perform all benchmark experiments in the cluster\nType I, and disconnect some nodes’ network during execu-\ntion. Each benchmarking is repeated 5 times and results are\nreported as the average and standard deviation. In total,\nthere are 90 experiments conducted in our evaluation.\n9. http://wiki.apache.org/hadoop/WordCount\n10. http://wiki.apache.org/hadoop/Grep\n11. http://www.tpc.org/tpch/\n12. http://www.tpc.org/tpcds/\n13. https://en.wikipedia.org/wiki/K-means clustering\n14. https://en.wikipedia.org/wiki/PageRank\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. 
Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n9\nTABLE 3\nThe accuracy of symptom detection for non-local outliers in a\nhomogeneous cluster\nBenchmark Total\ntasksD Outliers\n(detected as Sd)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 32 29 90.63 3.9\nGrep 236 37 33 89.19 4.8\nTPC-H 102 13 12 92.31 6.72\nTPC-DS 126 13 12 92.31 6.1\nK-means 234 34 29 85.29 1.25\nPageRank 235 28 25 89.29 6.2\nTABLE 4\nThe accuracy of symptom detection for the outliers stemming from\nresource variation in a heterogeneous cluster\nBenchmark Total\ntasksR Outliers\n(detected as Sr)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 37 33 89.19 2.77\nGrep 236 26 24 92.31 4.77\nTPC-H 102 9 8 88.89 5.47\nTPC-DS 126 13 12 92.31 6.9\nK-means 234 36 33 91.67 2.88\nPageRank 235 30 28 93.33 5.35\n5.2 Diagnosis detection evaluation\nIn this section, we evaluate the accuracy of our symptom\ndetection method. To this end, we execute our benchmarks\ninEnvBto increase number of Sdtasks (see §4.2.2). Next,\nto increase the issue of resource heterogeneity (Sr referring\nto §4.2.3), we run the benchmarks in EnvC. Thereafter,\nwe run the benchmarks in EnvHto emulate the network\nfailure (Sw referring to §4.2.4). Finally, we compare the\ndetected Outlier tasks with the ground truths that are the\ndata locality, resource heterogeneity, and network failure\nissues observed by the AutoDiagn diagnosing system.\nTable 3, Table 4, and Table 5 summarize all the results. All\nbenchmarks achieve high accuracy by using our proposal\nsymptom detection method. The highest accuracy for both\nSdand Srare 92.3%, and for Swis 94.7% and the overall\naccuracy for outlier detection is 91.3%, where the Error\nrepresents the variation of the accuracy depending on the\nrepeated experiments.\nWe compute the accuracy of our symptom detection\nmethod by using the number of detected outlier tasks di-\nvided by the actual number of the tasks that can cause the\noutlier issue. 
Table 3, for example, Dis the total number of\nnon-local tasks and Outliers (Sd) is the number of detected\noutlier tasks that belong to non-local task. Therefore, the\naccuracy isSd\nD. Table 4 and Table 5 follow the same approach\nto compute the accuracy.\nOutlier verification. To further verify the Sd,Sr, and Sw\nare the main reasons causing the outliers, we conduct the\nfollowing comparison experiments: 1) comparing the exe-\ncution time of local tasks and non-local tasks; 2) comparing\nthe execution time of the tasks running in EnvAand Env\nC; and 3) comparing the execution time of normal tasks and\nrestarted tasks due to network failure. Fig. 6(a) proves that\nnon-local tasks consume more time than local tasks due to\nthe overload introduced by data shuffling. Additionally, weTABLE 5\nThe accuracy of symptom detection for the outliers stemming from\nnetwork failures\nBenchmark Total\ntasksW Outliers\n(detected as Sw)Accuracy\n(%)Error\n(\u001b)\nWordCount 234 11 10 90.91 1.83\nGrep 236 13 12 92.31 6.73\nTPC-H 102 13 12 92.31 6.54\nTPC-DS 126 15 14 93.33 5.43\nK-means 234 17 16 94.12 4.33\nPageRank 235 19 18 94.74 4.23\ncompare the throughput of the local tasks and non-local\ntasks in terms of how much data can be processed in each\nsecond. Fig. 7 reveals that the throughput of non-local tasks\nis only 70% that of local tasks.\nMoreover, Fig. 6(b) shows that the execution time of\nthe tasks running on EnvAis less than that on EnvC.\nThis is because the tasks are equally distributed to all\ncomputing nodes and the less powerful nodes are saturated.\nFurthermore, Fig. 9(a) shows that the CPU usage of less\npowerful hosts reaches 100%, thereby building a task queue\nin these hosts, increasing the overall execution time. How-\never, Fig. 9(b) reveals that the powerful hosts have sufficient\ncomputing resources for processing the allocated tasks.\nFurthermore, Fig. 6(c) shows that the execution time of\nthe restarted tasks are longer than the normal tasks. As\nFig. 
8 illustrates, we compute the execution time of the\nrestarted task by adding the execution time of the task in\nthe disconnected node and that in the rescheduled node.\n5.3 Performance and overheads\nPerformance evaluation. We evaluate the performance of\nAutoDiagn by measuring the end-to-end response time of\nsymptom detection and root-cause analysis. Since they are\nnot affected by the types of benchmark, we report the\naverage of the response time. Fig. 10(a) shows that the\nreal-time symptom detection can achieve a low response\ntime, which only has 96 milliseconds and 1059 milliseconds\nwith 100 tasks and 1000 tasks, respectively. Although the re-\nsponse time increases linearly, the parallel execution method\ndiscussed in §3.6 can be applied to reduce the latency. The\nresponse time for root cause analysis is higher than that\nof symptom detection. For 100 tasks and 1000 tasks, their\nresponse times are 0.354 seconds and 5.974 seconds, respec-\ntively. Unlike the symptom detection which is very sensitive\nto latency because of the follow-up processes, triggering the\nfurther root-cause analysis or alerting the system managers,\nRoot-cause analysis aims to provide a holistic diagnosing of\na big system and the analysis results may help to improve\nthe system performance in future. As a result, the real-time\nroot-cause analysis is not compulsory.\nSystem overheads. To evaluate the system overhead intro-\nduced by AutoDiagn, we measure the CPU and memory\nusage of AutoDiagn Monitoring (agent) and AutoDiagn\nDiagnosing. Table 6 shows that -AutoDiagn Monitoring only\nconsumes approximately 2.52% memory and 4.69% CPU;\nwhile -AutoDiagn Diagnosis uses 2.08% memory and 3.49%\nCPU.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. 
See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n10\n 0 5 10 15 20 25 30 35 40\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingLocal tasks running on Env A\nNon-local tasks (D) running on Env B\n(a) Local tasks vs Non-local tasks\n 0 5 10 15 20 25 30 35 40\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingTasks running on Env A\nTasks (R) running on Env C (b) Homogeneous cluster vs Heterogeneous\ncluster\n 0 10 20 30 40 50 60\nWordCountGrepTPC-HTPC-DS K-means PageRankExecution time (sec)\nTypes of BenchmarkingTasks running on Env A\nTasks (W) running on Env H(c) Normal tasks vs Restarted tasks caused by\nnetwork failure\nFig. 6. Comparison of execution time of the tasks\n 0 1 2 3 4 5 6\nWordCountGrepTPC-HTPC-DS K-means PageRankThroughput (MB/s)\nTypes of BenchmarkingLocal tasks Non-local tasks\nFig. 7. The throughput of AutoDiagn\n 0 20 40 60 80 100\n0510152025303540455055Progress (%)\nElapsed time (sec)\nFig. 8. The life cycle of the restarted task\nFig 10(b) shows the network overhead of AutoDiagn.\nThe extra communication cost introduced by our tool is\nsmall but it increases when the number of parallel tasks\nincreases. For example, when the number of parallel task is\n100, there are about 45 messages per second sent from agents\nto RabbitMQ cluster and the total size of these messages is\n13.5 KB/s. The message rate and network overhead increase\nto 615 per second and 223 KB/s, respectively, when the\nnumber of parallel tasks is 1000.\nStorage overheads. AutoDiagn needs to dump the system\ninformation to a database which may consume extra storage\nresource. 
In our evaluation experiments, it only cost 3.75\nMB disk space in total. Obviously, increasing the types\nof symptom detection and root cause analysis will also\nconsume more storage resources. We discuss the potentialTABLE 6\nResource overhead caused by AutoDiagn components\nComponents Mem (%) CPU (%)\nAutoDiagn Monitoring 2.52 4.69\nAutoDiagn Diagnosing 2.08 3.49\nfuture work in §6.\n6 D ISCUSSION AND FUTURE WORK\nPopulating applications. In this paper, we propose a gen-\neral and flexible framework to uncover the performance\nreduction issues in a big data system. In particular, we\ndevelop and evaluate big data applications for outliers. New\napplications (including symptom detection and root-cause\nanalysis) are required to populate our system for future\nwork.\nOverhead cost reduction. Our system is designed in a\nloosely-coupled manner, the processing components can\nbe easily scaled. However, the storage overhead increases\nwith the number of applications increasing. [15] proposed a\ncaching method to aggregate the information before sending\nto destination nodes. We will explore this direction in future\nwork to reduce the storage overhead and network overhead.\nPerformance improvement. Mantri [10] utilized the outputs\nof the root cause analysis to improve the resource allocation\nin Hadoop clusters. Thus, one open research direction is to\nbuild a system which can react to analysis results, thereby\nimproving the performance of the big data system.\n7 R ELATED WORK\nMuch recent work in big data systems focuses on improving\nworkflows [16], [17], [18], programming framework [19],\n[20], [21], task scheduling [22], [23], [24].\nRoot-cause analysis. There is a large volume of published\nstudies describing the role of root-cause analysis. The au-\nthors of [10], [25], [26] take the next step of understanding\nthe reasons for performance reduction. 
Mantri [10] charac-\nterizes the prevalence of stragglers in Hadoop systems as\nwell as troubleshooting the cause of stragglers. Dean and\nBarroso [25] analyze the issues causing tail latency in big\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n11\n 0 20 40 60 80 100 CPU usage (%)\nTimelineCPU utilization Outliers\n(a) CPU utilization of less powerful hosts and outliers\n 0 20 40 60 80 100 CPU usage (%)\nTimelineCPU utilization (b) CPU utilization of high power hosts\nFig. 9. CPU utilization of two nodes running simultaneously. Outliers are most likely to occur in the nodes which have less computing resource.\n 0 1 2 3 4 5 6\n50100 200 300 400 500 600 700 800 9001000Response time (sec)\nNumber of tasks running in parallelSymptom detectionRoot-cause analysis\n(a) The end-to-end response time of AutoDiagn diag-\nnosis system\n 0 100 200 300 400 500 600\n501002003004005006007008009001000 0 50 100 150 200 250Messages per second\nData rate (KB/s)\nNumber of tasks running in parallelMessage rates\nSize (KB/s)(b) The message rates and network overhead\nFig. 10. Performance evaluation and network overhead of AutoDiagn\ndata systems. Garraghan et al. [11], [27] proposed a new\nmethod to identify long tail behavior in big data systems\nand evaluated in google data trace. 
The authors in [28] use\noffline log analysis methods to identify the root cause of\noutliers in a large-scale cluster consisting of thousands of\nnodes by tracking the resource utilization. Similarly, Zhou\net al. [29] use a simple but efficient rule based method to\nidentify the root cause of stragglers.\nAlong with these similar works, there are some re-\nsearchers using statistical and machine learning methods for\nroot-cause analysis. The authors of [30] introduce a Regres-\nsion Neural Network (RNN) based algorithm to trouble-\nshoot the causes of stragglers by processing Spark logs.\nMore algorithms such as the associated tree and fuzzy data\nenvelopment analysis [31] and Reinforcement Learning [32]\nare applied for finding the reasons of stragglers in Hadoop\nand Spark.\nIn [33], a Pearson coefficient of correlation is used for\nroot cause analysis to measure linear correlation between\nsystem metrics, workload and latency. However, these\nworks lack a systematic solution for root cause analysis for\nbig data processing systems and the proposed methods are\nnot applicable for real-time systems.\nDifferent to other work, the authors of [34] propose a\nnew algorithm that aims to reduce the proportion of strag-\ngler tasks in machine learning systems that use gradient-\ndescent-like algorithms. This work offers an idea to develop\nnew Diagnosers for machine learning systems using our\nframework.\nAnomaly detection and debugging. The authors in [35] pro-\npose a rule-based approach to identify anomalous behaviorsin Hadoop ecosystems by analyzing the task logs. This\nwork only analyzes the task logs, which fails to capture the\nperformance reduction issues caused by inefficient utilizing\nthe underlying resources. Next, Khoussainova et al. 
[36]\nbuild a historical log analysis system to study and track\nthe MapReduce jobs which cause performance reduction\nbased on their relevance, precision and generality principles.\nHowever, this cannot be performed for real-time anomaly\ndetection. Du et al. [37] train a machine learning model from\nthe normal condition data by using Long Short-Term Mem-\nory (LSTM) and this trained model is used for detecting\nin Hadoop and OpenStack environments. Our AutoDiagn\nprovides infrastructure into which the trained models can\nbe plugged to enrich the applications.\nReal-time operational data analytic system. Agelastos et al.\n[38] propose a monitoring system for HPC systems, which\ncan capture the cases of applications competing for shared\nresources. However, this system does not consider root-\ncause analysis of the performance reduction. The authors\nof [5], [39] do not only provide the feature of real-time\nmonitoring, but are also able to identify the performance\nissues and trouble-shoot the cause of the issues. In addition\nto them, [40] uses a type of artificial neural network called\nautoencoder for anomaly detection. They first monitor the\nsystem in real-time and collect the normal data for training\nthe model used to discern between normal and abnormal\nconditions in an online fashion. However, these systems are\ndeveloped for HPC clusters and are not suitable for big data\nsystems.\nTable 7 presents a brief overview of various monitoring\ntools for big data frameworks.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. 
Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n12\nTABLE 7\nThe features supported by existing tools and AutoDiagn\nFeature DataDog\n[2]Sequence\nIQ [3]Sematext\n[4]TACC\n[5]Mantri\n[10]DCDB\n[39]Nagios\n[41]Ganglia\n[42]Chukwa\n[43]DMon\n[44]AutoDiagn\nReal-time monitor-\ningYes Yes Yes Yes Yes Yes Yes Near\nreal-timeYes Near real-\ntimeYes\nRoot-cause analysis No No No No Yes Yes No No No Yes Yes\nBigData frameworks\nsupportGood Poor Good No Poor No Poor Poor Poor Good and\nExtensibleGood and\nExtensible\nUnderlying resource\nmonitoringYes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes\nReal-time monitor-\ning for big data tasksYes Yes Yes No Yes No No No Yes Yes Yes\nAuto-scaling Yes Yes Yes Yes Yes Yes No No Yes Yes Yes\nAlerts Yes No Yes No No No Yes No No No Yes\nVisualization of big\ndata tasksYes No Yes No No No No Yes No No Yes\nUser customized\nroot-cause analysisNo No No No No No No No No No Yes\n8 C ONCLUSION\nIn this paper, we have presented AutoDiagn, a framework\nfor enabling diagnosing of large-scale distributed systems\nto ascertain the root cause of outliers, with the core purpose\nof unravelling the concretization of complicated models\nfor system management. After making a comprehensive\nliterature review and identifying the requirements for real-\nworld problems, we conceived its design. The combination\nof user-defined functions powered by APIs and the agent-\nbased monitoring system along with the findings obtained\nfrom an empirical analysis of the experiments we conducted\nplay a fundamental role in the development of the system.\nAutoDiagn can be applied to most big data systems along\nwith the monitoring systems. We have also presented the\nimplementation and integration of the AutoDiagn system to\nthe SmartMonit [45], real-time big data monitoring system,\ncombined in our production environment. 
In our implemen-\ntation on a large cluster, we find AutoDiagn very effective\nand efficient.\nOutliers are one of the main problems in big data sys-\ntems that overwhelm the whole system and reduce perfor-\nmance considerably. AutoDiagn embraces this problem to\nreveal the bottlenecks alongside their root causes.\nACKNOWLEDGEMENT\nThis research is funded by the Turkish Ministry of Na-\ntional Education. This research is partially funded by\nthe following UKRI projects: SUPER (EP/T021985/1),\nPACE (EP/R033293/1), and Centre for Digital Citizens\n(EP/T022582/1). This work is also supported by the grant\nof National Natural Science Foundation of China (62072408)\nand Zhejiang Provincial Natural Science Foundation of\nChina (LY20F020030).\nREFERENCES\n[1] A. Noor, K. Mitra, E. Solaiman, A. Souza, D. N. Jha, U. Demirbaga,\nP . P . Jayaraman, N. Cacho, and R. Ranjan, “Cyber-physical appli-\ncation monitoring across multiple clouds,” Computers & Electrical\nEngineering, vol. 77, pp. 314–324, 2019.[2] Datadog. Accessed: 2020-07-13. [Online]. Available: https:\n//www.datadoghq.com/\n[3] Sequenceiq. Accessed: 2020-07-14. [Online]. Available: https:\n//github.com/sequenceiq\n[4] Sematext. Accessed: 2020-07-13. [Online]. Available: https:\n//sematext.com/\n[5] R. T. Evans, J. C. Browne, and W. L. Barth, “Understanding\napplication and system performance through system-wide moni-\ntoring,” in 2016 IEEE International Parallel and Distributed Processing\nSymposium Workshops (IPDPSW). IEEE, 2016, pp. 1702–1710.\n[6] G. Iuhasz, D. Pop, and I. Dragan, “Architecture of a scalable\nplatform for monitoring multiple big data frameworks,” Scalable\nComputing: Practice and Experience, vol. 17, no. 4, pp. 313–321, 2016.\n[7] I. Dr ˘agan, G. Iuhasz, and D. Petcu, “A scalable platform for\nmonitoring data intensive applications,” Journal of Grid Computing,\nvol. 17, no. 3, pp. 503–528, 2019.\n[8] S. 
Babu, “Towards automatic optimization of mapreduce pro-\ngrams,” in Proceedings of the 1st ACM symposium on Cloud com-\nputing, 2010, pp. 137–142.\n[9] R. S. Xin, J. Rosen, M. Zaharia, M. J. Franklin, S. Shenker, and\nI. Stoica, “Shark: Sql and rich analytics at scale,” in Proceedings of\nthe 2013 ACM SIGMOD International Conference on Management of\ndata, 2013, pp. 13–24.\n[10] G. Ananthanarayanan, S. Kandula, A. G. Greenberg, I. Stoica,\nY. Lu, B. Saha, and E. Harris, “Reining in the outliers in map-\nreduce clusters using mantri.” in Osdi, vol. 10, no. 1, 2010, p. 24.\n[11] P . Garraghan, X. Ouyang, P . Townend, and J. Xu, “Timely long\ntail identification through agent based monitoring and analytics,”\nin2015 IEEE 18th International Symposium on Real-Time Distributed\nComputing. IEEE, 2015, pp. 19–26.\n[12] J. Han, J. Pei, and M. Kamber, Data mining: concepts and techniques.\nElsevier, 2011.\n[13] T. Renner, L. Thamsen, and O. Kao, “Coloc: Distributed data and\ncontainer colocation for data-intensive applications,” in 2016 IEEE\nInternational Conference on Big Data (Big Data). IEEE, 2016, pp.\n3008–3015.\n[14] A. Rasooli and D. G. Down, “Guidelines for selecting hadoop\nschedulers based on system heterogeneity,” Journal of grid com-\nputing, vol. 12, no. 3, pp. 499–519, 2014.\n[15] A. Rabkin, M. Arye, S. Sen, V . S. Pai, and M. J. Freedman,\n“Aggregation and degradation in jetstream: Streaming analytics in\nthe wide area,” in 11thfUSENIXg Symposium on Networked Systems\nDesign and Implementation (fNSDIg 14), 2014, pp. 275–288.\n[16] Z. Wen, T. Lin, R. Yang, S. Ji, R. Ranjan, A. Romanovsky, C. Lin,\nand J. Xu, “Ga-par: Dependable microservice orchestration frame-\nwork for geo-distributed clouds,” IEEE Transactions on Parallel and\nDistributed Systems, vol. 31, no. 1, pp. 129–143, 2019.\n[17] Z. Wen, J. Cała, P . Watson, and A. 
Romanovsky, “Cost effective,\nreliable and secure workflow deployment over federated clouds,”\nIEEE Transactions on Services Computing, vol. 10, no. 6, pp. 929–941,\n2016.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n13\n[18] Z. Wen, R. Qasha, Z. Li, R. Ranjan, P . Watson, and A. Romanovsky,\n“Dynamically partitioning workflow over federated clouds for\noptimising the monetary cost and handling run-time failures,”\nIEEE Transactions on Cloud Computing, 2016.\n[19] G. Malewicz, M. H. Austern, A. J. Bik, J. C. Dehnert, I. Horn,\nN. Leiser, and G. Czajkowski, “Pregel: a system for large-scale\ngraph processing,” in Proceedings of the 2010 ACM SIGMOD Inter-\nnational Conference on Management of data, 2010, pp. 135–146.\n[20] M. Zaharia, M. Chowdhury, M. J. Franklin, S. Shenker, I. Stoica\net al., “Spark: Cluster computing with working sets.” HotCloud,\nvol. 10, no. 10-10, p. 95, 2010.\n[21] M. Abadi, P . Barham, J. Chen, Z. Chen, A. Davis, J. Dean, M. Devin,\nS. Ghemawat, G. Irving, M. Isard et al., “Tensorflow: A system for\nlarge-scale machine learning,” in 12thfUSENIXg symposium on\noperating systems design and implementation (fOSDIg 16), 2016, pp.\n265–283.\n[22] M. Isard, V . Prabhakaran, J. Currey, U. Wieder, K. Talwar, and\nA. Goldberg, “Quincy: fair scheduling for distributed computing\nclusters,” in Proceedings of the ACM SIGOPS 22nd symposium on\nOperating systems principles, 2009, pp. 261–276.\n[23] N. J. 
Yadwadkar and W. Choi, “Proactive straggler avoidance\nusing machine learning,” White paper, University of Berkeley, 2012.\n[24] A. Badita, P . Parag, and V . Aggarwal, “Optimal server selection\nfor straggler mitigation,” IEEE/ACM Transactions on Networking ,\nvol. 28, no. 2, pp. 709–721, 2020.\n[25] J. Dean and L. A. Barroso, “The tail at scale,” Communications of the\nACM, vol. 56, no. 2, pp. 74–80, 2013.\n[26] K. Ousterhout, R. Rasti, S. Ratnasamy, S. Shenker, and B.-G. Chun,\n“Making sense of performance in data analytics frameworks,”\nin12thfUSENIXg Symposium on Networked Systems Design and\nImplementation (fNSDIg 15), 2015, pp. 293–307.\n[27] P . Garraghan, X. Ouyang, R. Yang, D. McKee, and J. Xu, “Straggler\nroot-cause and impact analysis for massive-scale virtualized cloud\ndatacenters,” IEEE Transactions on Services Computing, vol. 12, no. 1,\npp. 91–104, 2016.\n[28] X. Ouyang, P . Garraghan, R. Yang, P . Townend, and J. Xu, “Re-\nducing late-timing failure at scale: Straggler root-cause analysis in\ncloud datacenters,” in Fast Abstracts in the 46th Annual IEEE/IFIP\nInternational Conference on Dependable Systems and Networks. DSN,\n2016.\n[29] H. Zhou, Y. Li, H. Yang, J. Jia, and W. Li, “Bigroots: An effective\napproach for root-cause analysis of stragglers in big data system,”\nIEEE Access, vol. 6, pp. 41 966–41 977, 2018.\n[30] S. Lu, X. Wei, B. Rao, B. Tak, L. Wang, and L. Wang, “Ladra:\nLog-based abnormal task detection and root-cause analysis in big\ndata processing with spark,” Future Generation Computer Systems,\nvol. 95, pp. 392–403, 2019.\n[31] Z. He, Y. He, F. Liu, and Y. Zhao, “Big data-oriented product infant\nfailure intelligent root cause identification using associated tree\nand fuzzy dea,” IEEE Access, vol. 7, pp. 34 687–34 698, 2019.\n[32] H. Du and S. Zhang, “Hawkeye: Adaptive straggler identification\non heterogeneous spark cluster with reinforcement learning,”\nIEEE Access, vol. 8, pp. 57 822–57 832, 2020.\n[33] J. P . 
Magalh ˜aes and L. M. Silva, “Root-cause analysis of perfor-\nmance anomalies in web-based applications,” in Proceedings of the\n2011 ACM Symposium on Applied Computing, 2011, pp. 209–216.\n[34] R. Bitar, M. Wootters, and S. El Rouayheb, “Stochastic gradient\ncoding for straggler mitigation in distributed learning,” IEEE\nJournal on Selected Areas in Information Theory, vol. 1, no. 1, pp.\n277–291, 2020.\n[35] A. M. Chacko, J. S. Medicherla, and S. M. Kumar, “Anomaly\ndetection in mapreduce using transformation provenance,” in\nAdvances in Big Data and Cloud Computing. Springer, 2018, pp.\n91–99.\n[36] N. Khoussainova, M. Balazinska, and D. Suciu, “Perfx-\nplain: debugging mapreduce job performance,” arXiv preprint\narXiv:1203.6400, 2012.\n[37] M. Du, F. Li, G. Zheng, and V . Srikumar, “Deeplog: Anomaly\ndetection and diagnosis from system logs through deep learning,”\ninProceedings of the 2017 ACM SIGSAC Conference on Computer and\nCommunications Security, 2017, pp. 1285–1298.\n[38] A. Agelastos, B. Allan, J. Brandt, P . Cassella, J. Enos, J. Fullop,\nA. Gentile, S. Monk, N. Naksinehaboon, J. Ogden et al., “The\nlightweight distributed metric service: a scalable infrastructure\nfor continuous monitoring of large scale computing systems and\napplications,” in SC’14: Proceedings of the International Conferencefor High Performance Computing, Networking, Storage and Analysis.\nIEEE, 2014, pp. 154–165.\n[39] A. Netti, M. M ¨uller, C. Guillen, M. Ott, D. Tafani, G. Ozer, and\nM. Schulz, “Dcdb wintermute: Enabling online and holistic op-\nerational data analytics on hpc systems,” in Proceedings of the 29th\nInternational Symposium on High-Performance Parallel and Distributed\nComputing, 2020, pp. 101–112.\n[40] A. Borghesi, A. Bartolini, M. Lombardi, M. Milano, and L. Benini,\n“Anomaly detection using autoencoders in high performance\ncomputing systems,” in Proceedings of the AAAI Conference on\nArtificial Intelligence, vol. 33, 2019, pp. 9428–9433.\n[41] Nagios. 
Accessed: 2020-07-15. [Online]. Available: https://www.\nnagios.org/\n[42] Ganglia. Accessed: 2020-07-15. [Online]. Available: http://ganglia.\ninfo/\n[43] Apache chukwa. Accessed: 2020-07-14. [Online]. Available:\nhttps://chukwa.apache.org/\n[44] Dmon. Accessed: 2020-07-12. [Online]. Available: https://github.\ncom/Open-Monitor/dmon\n[45] U. Demirbaga, A. Noor, Z. Wen, P . James, K. Mitra, and R. Ranjan,\n“Smartmonit: Real-time big data monitoring system,” in 2019 38th\nSymposium on Reliable Distributed Systems (SRDS). IEEE, 2019, pp.\n357–3572.\nUmit Demirbaga (Member, IEEE) is a PhD stu-\ndent in the School of Computing, Newcastle\nUniversity, UK. He received an MSc degree in\nComputer Science from Newcastle University,\nUK in 2017 and the BSc degree in Electronics\nand Computer Education from Marmara Univer-\nsity, Turkey in 2011. His research interests in-\nclude big data analytics, cloud computing and\ndistributed systems. He was awarded Outstand-\ning Performance Award with Best Team Project\nAward in his MSc in 2017.\nZhenyu Wen (Member, IEEE) received MSc and\nPhD degrees in Computer Science from New-\ncastle University, Newcastle upon Tyne, UK, in\n2011 and 2016, respectively. He is currently a\nPostdoc Researcher with the School of Com-\nputing, Newcastle University, UK. His current re-\nsearch interests include IoT, crowd sources, AI\nsystem, and cloud computing. For his contribu-\ntions to the area of scalable data management\nfor the Internet of Things. He was awarded the\nIEEE TCSC Award for Excellence in Scalable\nComputing (Early Career Researchers) in 2020.\nAyman Noor is a PhD student in Computer\nScience at Newcastle University, UK. His cur-\nrent research interests include cloud computing,\nmonitoring, and machine learning. 
He earned a\nMaster of Science in Computer and Information\nScience from Gannon University, PA, USA in\n2013 and a Bachelor in Computer Science from\nthe College of Computer Science and Engineer-\ning from Taibah University, Madinah, SA in 2006.\nKaran Mitra is an Assistant Professor at Lule ˚a\nUniversity of Technology, Sweden. He received\nhis Dual-badge PhD from Monash University,\nAustralia and Lule ˚a University of Technology in\n2013. His research interests include cloud and\nmobile cloud computing, performance bench-\nmarking of distributed systems, context-aware\ncomputing and QoE. He is a member of the IEEE\nand ACM.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. \n0018-9340 (c) 2021 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission. See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication. Citation information: DOI 10.1109/TC.2021.3070639, IEEE\nTransactions on Computers\n14\nKhaled Alwasel has a BS and MS in informa-\ntion technology from Indiana University-Purdue\nUniversity Indianapolis (2014) and Florida Inter-\nnational University (2015), USA. He is currently\nworking toward a PhD in the School of Com-\nputing Science at Newcastle University (UK).\nKhaled’s interests lie in the areas of software-\ndefined networking (SDN), big data, IoT, edge\ncomputing, and cloud computing\nSaurabh Garg is a lecturer at the University of\nTasmania, Hobart, Tasmania. He has published\nmore than 30 papers in highly cited journals\nand conferences with H-index 24. He has gained\nabout three years of experience in industrial re-\nsearch while working at IBM Research Australia\nand India. 
His areas of interest are distributed\ncomputing, cloud computing, HPC, IoT, big data\nanalytics, and education analytics.\nAlbert Y. Zomaya is currently the Chair Pro-\nfessor of High Performance Computing & Net-\nworking in the School of Computer Science,\nUniversity of Sydney. He is also the Director of\nthe Centre for Distributed and High Performance\nComputing which was established in late 2009.\nProfessor Zomaya was an Australian Research\nCouncil Professorial Fellow during 2010-2014\nand held the CISCO Systems Chair Professor\nof Internetworking during the period 2002–2007\nand also was Head of School for 2006–2007.\nRajiv Ranjan is a Full professor in Comput-\ning Science at Newcastle University, UK. Before\nmoving to Newcastle University, he was Julius\nFellow (2013-2015), Senior Research Scientist\nand Project Leader in the Digital Productivity and\nServices Flagship of Commonwealth Scientific\nand Industrial Research Organization (CSIRO\nC Australian Government’s Premier Research\nAgency). Prior to that he was a Senior Research\nAssociate (Lecturer level B) in the School of\nComputer Science and Engineering, University\nof New South Wales (UNSW). Dr Ranjan has a PhD (2009) from\nthe department of Computer Science and Software Engineering, the\nUniversity of Melbourne.\nAuthorized licensed use limited to: San Francisco State Univ. Downloaded on June 23,2021 at 08:53:22 UTC from IEEE Xplore. Restrictions apply. ",
"metadata": {
"filename": "demirbaga2022.pdf",
- "file_path": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\RSL-Daase2024\\demirbaga2022.pdf",
- "file_size": 3420259,
- "file_type": ".pdf",
- "imported_at": "2025-12-17T21:23:37.106306",
- "content_length": 76788
- }
+ "filepath": "C:\\Users\\Icaro\\Documents\\projetos-google-cli\\data-quality-chatbot\\docs_to_import\\rsl_daase2024\\demirbaga2022.pdf",
+ "size": 3420259,
+ "source": "docs_to_import"
+ },
+ "id": "4eee3406-0542-45ea-afdc-870a7ac4dd41"
},
- "9ae75c26-80ce-494f-af6d-7db9503ae926": {
- "id": "9ae75c26-80ce-494f-af6d-7db9503ae926",
- "content": "[Página 1]\nBigBench: TowardsanIndustryStandardBenchmarkfor\nBi\ngDataAnalytics\nAhmadGhazal1,5,TilmannRabl2,6,MinqingHu1,5,\nFrancois Raab4,8,MeikelPoess3,7,AlainCrolotte1,5,Hans-Arno Jacobsen2,9\n1TeradataCorp.,2UniversityofToronto,3OracleCorp.,4InfoSizing,Inc.\n5{ahmad.ghazal,minqing.hu,alain.crolotte}@teradata.com,6tilmann@msrg.utoronto.ca\n7meikel.poess@oracle.com,8francois@sizing.com,9jacobsen@eecg.toronto.edu\nABSTRACT\nThere is a tremendous interest in big data by academia,\nindustryanda large user base. Several commercial andopen\nsource providers unleashed a variety of products to support\nbig data storage and processing. As these products mature,\nthere is a need to evaluate and compare the performance of\nthese systems.\nIn this paper, we present BigBench, an end-to-end big\ndata benchmark proposal. The underlying business model\nof BigBench is a product retailer. The proposal covers a\ndata model and synthetic data generator that addresses the\nvariety, velocity and volume aspects of big data systems con-\ntaining structured, semi-structured and unstructured data.\nThe structured part of the BigBench data model is adopted\nfrom the TPC-DS benchmark, which is enriched with semi-\nstructured and unstructured data components. The semi-\nstructured part captures registered and guest user clicks\non the retailer’s website. The unstructured data captures\nproduct reviews submitted online. The data generator de-\nsigned for BigBench provides scalable volumes of raw data\nbased on a scale factor. The BigBench workload is designed\naround a set of queries against the data model. From a busi-\nness prospective, the queries cover the different categories of\nbig data analytics proposed by McKinsey. 
From a technical\nprospective, the queries are designed to span three different\ndimensions based on data sources, query processing types\nand analytic techniques.\nWe illustrate the feasibility of BigBench by implement-\ning it on the Teradata Aster Database. The test includes\ngenerating and loading a 200 Gigabyte BigBench data set\nand testing the workload by executing the BigBench queries\n(written using Teradata Aster SQL-MR)and reporting their\nresponse times.\nCategoriesandSubjectDescriptors\nD.2.8[Software Engineering ]: Metrics— performance mea-\nsures\nPermission to make digital or hard copies of all or part of this work for\npersonal or classroom use is granted without fee provided that copies are\nnot madeor distributed for proftor commercial advantage and that copies\nbearthisnoticeandthefullcitation onthefrstpage. Tocopyotherwise,to\nrepublish,topostonserversortoredistributetolists,requirespriorspecifc\npermissionand/or afee.\nSIGMOD’13, June22–27,2013,NewYork,NewYork,USA.\nCopyright 2013ACM978-1-4503-2037-5/13/06 ...$15.00.Keywords\nBenchmarking; big data; map reduce\n1. INTRODUCTION\nToday’s data explosion, fueled by emerging applications,\nsuch as social networking, micro blogs, and the“crowd intel-\nligence”capabilities of many sites, has led to the“big data”\nphenomenon. It is characterized by increasing volumes of\ndata of disparate types (i.e., structured, semi-structuredand\nunstructured)from sources that generate new data at a high\nrate (e.g., click streams captured in web server logs). This\nwealth of data provides numerous new analytic and business\nintelligence opportunitieslike fraud detection, customer pro-\nfiling, and churn and customer loyalty analysis.\nConsequently, there is tremendous interest in academia\nand industry to address the challenges in storing, access-\ning and analyzing this data. Several commercial and open\nsource providers already unleashed a variety of products to\nsupport big data storage and processing. 
These tools are\nmostly parallel database management systems (e.g., Green-\nplum[4], Netezza’s TwinFin[9], Teradata[8], Oracle[6]) or\nMapReduce (MR) based systems (e.g., Hadoop [1], Cloud-\nera’s CDH [3], Hive[2] and many other systems like those in\n[15, 17, 24, 27]).\nAs big data systems mature, the pressure to evaluate and\ncompare performance and price performance of these sys-\ntems rises. However, to date there are no standard bench-\nmarks available. This takes us back to the middle of the\n1980’s, when the lack of standard database benchmarks led\nmanydatabasemanagementsystemvendorstopracticewhat\nis now referred to as“benchmarketing”– a practice in which\norganizations makeperformanceclaims basedonself-defined,\nhighly biased benchmarks. The goal of publishing results\nfromsuchtailoredbenchmarkswastostatemarketingclaims,\nregardless of the absence of relevant and verifiable technical\nmerit. In essence, these benchmarks were designed as for-\ngone conclusions to fit a pre-established marketing message.\nSimilarly, vendors would create configurations, referred to\nas “benchmark specials”, that were specifically designed to\nmaximize performance against a specific benchmark with\nlimited benefit to real-world applications.\nTowards the end of the 1980’s, as a response to this grow-\ning practice, benchmark consortia such as the Transaction\nProcessing Performance Council (TPC) and the Standard\nPerformance Corporation (SPEC) were founded. 
Influenced\nbyacademic databaseexpertsandwell-known industrylead-\n1197\n\n[Página 2]\nUnstructured \nDa\nta \nSemi-Structured Data Structured Data \nSales \nCustomer \nItem \n Marketprice \nWeb Page \nWeb Log \nReviews \nAdapted \nTP\nC-DS \nBigBench \nSpecific \nFigure 1: Big Data Benchmark Data Model\ner\ns, industry standard benchmarks such as TPC-A, TPC-C\nand TPC-D were engineered and rules around publishing\nresults were agreed upon.\nRecently a few efforts in the area of big data benchmarks\nemerged, such as YCSB[16], PigMix[7], GridMix [5] and\nGraySort [20]. These efforts are island solutions and not\npoliced by any industry consortia. While some are focused\non one or a subset of components and tasks typical for big\ndata systems, others are based on specific map-reduce-style\nsystems.\nWebelieveanindustrystandardbigdatabenchmarkmust\nbe an end-to-end benchmark covering all major characteris-\ntics in the lifecycle of a big data system including the three\nVs described by Douglas Laney[21]: (i) volume(larger data\nset sizes), (ii) velocity (higher data arrival rates, such as\nclick streams) and (iii) variety(increased data type dispar-\nity, such as structured data from relational tables, semi-\nstructured data from key-value web clicks and un-structured\ndata from social media content).\nIn this paper, we present our proposal for an end-to-end\nbig data benchmark. After a presentation of initial ideas for\nthe benchmark at the first Workshop on Big Data Bench-\nmarking1a group formed that collaborated on building the\nspecification. We call it “BigBench”. It is based on a ficti-\ntious retailer who sells products to customers via physical\nand online stores. The proposal covers a data model, syn-\nthetic data generator and workload description. The work-\nload queries are specified in English, since no clear standard\nfor big data systems has yet emerged. 
We also suggest di-\nrections for big data metrics specific to data loading and\nworkload execution. Furthermore, the feasibility of the pro-\nposal is validated by implementing it on the Teradata Aster\nDBMS (TAD). This experiment involves generating 200 Gi-\ngabyte of raw data and loading it into TAD. The English\nlike workload queries are implemented using TAD’s SQL-\nMR syntax and executed as a single stream of queries.\nThefirstmajor componentofBigBenchisthespecification\nof a data model that focuses on volume, variety and velocity.\nThe variety property of our model is illustrated in Figure\n1. The structured part of BigBench is adapted from the\nTPC-DS data model, which also depicts a product retailer\n[23]. We borrowed the store and online sales portion from\nthat model and added a table for prices from the retailer’s\ncompetitors.\nThe structured part is enriched with semi-structured and\n1WBDB, May 2012, San Jose – http://clds.ucsd.edu/\nwbdb2012un-structured data shown in the lower and right hand side\nof Figure 1. The semi-structured part is composed by clicks\nmade by customers and guest users visiting the retailer’s\nweb site. Our design assumes the semi-structured data to\nbe in a key-value format similar to Apache’s web server log\nformat. The un-structured data in our model is covered by\nproduct reviews that can be submitted by guest users or\nactual customers.\nWe also provide the design and implementation of a data\ngenerator for the proposed BigBench data model. Our data\ngenerator is based on an extension of PDGF [29]. PDGF is\na parallel data generator that is capable of producing large\namounts data for an arbitrary schema. The existing PDGF\ncan be used to generate the structured part of the BigBench\nmodel. However, it is not capable of producing neither the\nsemi-structured web clicks nor the unstructured product re-\nviewstext. 
Partofourcontributioninthispaperistoextend\nPDGF to coverthesemi-structuredandun-structuredparts.\nWe enhanced PDGF to produce a key-value data set for a\nfixed set of required and optional keys. This is sufficient to\ngenerate the web logs part of BigBench.\nThe main challenge in generating product reviews is to\nproduceun-structuredtext. Wedevelopedandimplemented\nan algorithm that produces synthetic text based on some\nsample inputtext. The algorithm usesa MarkovChain tech-\nnique that extracts key words and builds a dictionary based\non these key words. The new algorithm, called TextGen, is\napplied or our retailer model by using some real product re-\nviews from amazon.com for the initial sample data. PDGF\ninteracts with TextGen through an API sending product\ncategory as input and getting a product review text for that\ncategory.\nThe volumedimension of ourmodel is far simpler thanthe\nvariety discussion and previous data generators had a good\nhandle on that. PDGF handles the volume well since it can\nscale the size of the data based on a scale factor. It also\nruns efficiently for large scale factors since it runs in parallel\nand can leverage large systems dedicated for the benchmark.\nWe also address big data velocity by establishing a periodic\nrefresh scheme that constantly adds data to the different\nareas of the data model.\nThe second major component of BigBench is the speci-\nfication of workload queries applied on the BigBench data\nmodel. In terms of business questions, we found that the\nbig data retail analytics by McKinsey [22] serves our pur-\npose given that BigBench is about retail. In [22] five major\nareas, or business levers, of big data analytics are identified:\nmarketing, merchandising, operations, supplychainandnew\nbusiness models.\nIn addition to the big data retail business levers above,\nwe looked at three different technical dimensions the Big-\nBench queries should span. 
The first technical dimension is\nabout the type of data used in queries. This implies mak-\ning sure that structured types, semi-structured types, un-\nstructured types and their combinations are each covered\nin the queries. The second technical dimension covers the\ntwo common paradigms of declarative processing (SQL and\nsimilar constructs like HQL) and procedural MR processing.\nTo that end, some queries are best suited to be declarative,\nothers to be procedural and others to be a mix of both. The\nthirdtechnical dimensionis aboutthedifferentalgorithms of\nanalytic processing as described by the Apache MAHOUT\nsystem. Examples of these algorithms are classifications,\n1198\n\n[Página 3]\npattern matching, clustering, regression, dimensional redu c-\ntion, etc.\nIn summary, our key contributions are as follows:\n1. Wepresentthe firstend-to-endbenchmarkfor big data\nanalytics while previous work focused on few selected\ntypes of data or processing. BigBench implements the\ncomplete use-case of a realistic retail business.\n2. We specify 30 queries that cover all important aspects\nof big data analytics. The queries are specified in En-\nglish as well as TAD’s SQL-MR syntax.\n3. We develop and implement a novel technique for pro-\nducing un-structured text data and integrate it with a\ntraditional structured data generator.\n4. We conduct a proof of concept implementation and\nevaluation of BigBench by executing the benchmark\non the Teradata Aster DBMS.\nThe remainder of this paper is organized as follows. Sec-\ntion 2 covers previous work related to big data benchmark-\ning. Section 3 gives a detailed description of the BigBench\nbenchmark. The data model and data generation are de-\nscribed in detail in Sections 3.1 and 3.2. We describe the\nworkload queries in Section 3.3 and the benchmark metrics\nin Section 3.4. We present our proof of concept implemen-\ntation of BigBench using TAD in Section 4 including results\ninvolving 200 Gigabyte database. 
Finally, Section 5 summa-\nrizes the paper and suggests future directions.\n2. RELATEDWORK\nThe requirement for well defined benchmarks that mea-\nsuretheperformanceofDBMSdealingwithverylarge amounts\nof data emerged when the first generation of commercial\nsystems appeared in the 1980’s by Teradata Corporation\nand other more traditional DBMS vendors, who followed.\nDriven by vendor’s needs to compare commercial systems,\nthe Transaction Processing Performance Council developed\na series of data warehouse end-to-end benchmarks starting\nwith TPC-D in the beginning of the 90’s and TPC-H and\nTPC-R in the dawn of 2000 (all specifications available from\nthe TPC website2). These benchmarks, restricted to ter-\nabyte data sizes, emphasized single and multi-user perfor-\nmance of complex SQL query processing capabilities with\nsome updates on an enterprise data warehouse. Even ear-\nlier, academia started developing micro benchmarks such as\nthe Wisconsin benchmark, the OO7 [12] and BUCKY [13]\nbenchmarks for object-oriented DBMSs, XMark [31] and\nEXRT [14] benchmarks for XML-related DBMS technolo-\ngies.\nAs data volumes grew from megabytes of data and simple\ndata models (small number of tables with few relationships)\nover time to petabytesandcomplex data models (large num-\nber of tables with many complex relationships) the TPC\nresponded with the development of its next generation deci-\nsion support benchmark, TPC-DS [23], in the early 2000’s.\nStill based on the SQL programming language it contains\nmany big data elements, such as very large data and system\nsizes. Although the current limit is 100 terabyte, the data\ngenerator and schema can be extended to petabytes. 
It also\n2TPC -http://www.tpc.orgcontains very complex analytical queries using sophisticated\nSQL structures and a concurrent update model.\nIn parallel, academia as well as emerging big data com-\npanies have started defining the next generation big data\nbenchmarks, which are mostly component and micro bench-\nmarks. Yahoo! developed its cloud serving benchmark,\nYCSB, to evaluate NoSQL data stores [16]. It is a flexi-\nble multiuser benchmark with two tiers, a performance tier\n(testing latency) and a scalability tier. In the original paper,\nthree workloads were runagainst four different data stores:\nHBase, Cassandra, PNUTs, and MySQL. Other evaluations\nfollowed that extended the scope of YCSB [30, 25]. The\nCALDA effort [26] defined a micro-benchmark for big data\nanalytics based on Google’s MapReduce paper and com-\npared Hadoop with two RDBMS systems, one that is row\nand one that is column organized. Another widely used\nbenchmark is the TeraSort or GraySort benchmark [20],\nwhich can be considered a micro benchmark that sorts a\nlarge number of 100-byte records doing considerable amount\nof computation, networking, and storage I/O. Other bench-\nmarks are the GridMix [5] and PigMix [7].\nTPC-DS [23, 28] is TPC’s latest decision support bench-\nmark. It covers the major three disciplines in the life-cycle\nof a relational decision support benchmark, namely (i) load-\ning the initial database (ii) executing queries in both single-\nand multi-user modes (iii) refreshing the database. TPC-DS\nhandles some aspects of big data like volume and some as-\npects of velocity. Still, it lacks key components of big data\nlike semi-structured and unstructured data and their asso-\nciated analytics.\nIn summary, previous benchmarks described in this sec-\ntion are mostly micro and component benchmarks. Others\nlike TPC-DS lack key big data characteristics. This brings\na need for an end-to-end benchmark for big data processing.\n3. 
BIGDATABENCHMARK\nThis section covers the major parts of the BigBench speci-\nfication. Due to space restrictions, not all details can be pre-\nsented here. Additional details can be found in an extended\nversion of this paper, to be made available at publication\ntime.\n3.1 DataModel\nThe three cornerstone aspects of big data systems are vol-\nume,variety,velocity. Big data systems need to be able\nto deal with large volumes of data, sometimes in the mul-\ntiple petabyte range. We deal with the volume aspect in\nthe following section about data scaling. Variety refers to\nthe ability to deal with differently organized data, from un-\nstructured to semi-structured and structured data. The fol-\nlowing section about variety lays out a structure that cov-\ners all the types of data integrated in one model. Velocity\nrefers to the ability of a big data system to stay current\nthrough periodic refreshes, commonly referred to as extrac-\ntion,transformation andload(ETL). A big data system is\nnot a one-time snapshot of a business operations database\nnor is it a database where OLTP applications are running\nconcurrently. Hence, staying current with the operational\nside is a very important aspect of analytical systems, and\neven more so in the context of a big data system.\nIn the following subsection, we develop the data model\nshowing how the 3 Vs in big data are addressed in Big-\nBench. 
We show how volumeis addressed by using scale\n1199\n\n[Página 4]\n127.0.0.1 - - [Jun/23/2003:05:59:23 +0200]\n\"G\nET/page33.html?wcs_click_date=2452814\n&wcs_click_ time=21563&wcs_user_id=95789\n&wcs_web_page_sk=32&wcs_item_sk=28 HTTP/1.1\" 200 2256\n\"http://www.someurl.org\" \"Mozilla/5.0\"\nFigure 2: Example of a web log entry\nfactors in the data generators to scale data up to petabytes\nof data, how varietyis addressed through the usage of data\nfrom many sources and how velocityis achieved by periodic\nrefreshes of the data repository.\n3.1.1 Variety\nThe general benchmark data model is summarized in Fig-\nure 1, which shows the three data components of the bench-\nmark namely structured data, semi-structured data and un-\nstructureddatatogetherwiththerelationshipsbetweenthem.\nThe structured component of BigBench is adapted from\nthe TPC-DS benchmarkrecently publishedbytheTPC [10].\nA description of this benchmark can be found in [23, 28].\nBigBench is however not a simple extension of TPC-DS.\nInstead, BigBench focuses chieflyon the analytics associated\nwith semi-structured and unstructured data.\nWith a few exceptions most of the tables contained in\nTPC-DS are used by BigBench; the main focus being store\nand web sales, which only contain structured data. These\ntables cover data relating to the purchases made in stores\nand over the web, but also related tables such as itemde-\nscribing the items offered by the retailer, customer and its\nancillary tables containing all relevant client data, webpage\nan\ndwebsitedescribing pages and web sites used by on-\nl\nine clients and all associated dimension tables. To better\nsupport our functional design, we also added a new table\ncalleditemmarketprices to the structured data. 
It contains\ncompetitor names and prices for each item so that price\ncomparisons performed by online users who are interested\nin particular items could also be captured.\nThe semi-structured data focuses on click-streams, con-\ntained in web log files. While some of the clicks result in\nsales thereby necessitating a link to structured area tables\ncontaining online sales, item, web pages, customer and asso-\nciated dimensions, the large majority of these clicks are as-\nsociated with browsing activity not resulting in sales. These\nclicks focus on items and are associated with registered users\nor guests.The format retained for the clicks is that of Apache\nlogs. A typical entry of such a log associated with a regis-\ntered user could look like the example in Figure 2.\nWeb logs can be processed either directly at run time (late\nbinding) or parsed and stored into a structured table/file.\nSince all values are surrogate keys referring to the struc-\ntured schema, the above record once processed could look\nlike Table 1.\nThe unstructured data resembles written text associated\nwith product reviews of items offered by the retailer. Such\nreviews could be from several sources, namely guest users,\nregistered users withapurchaseandregistered userswithout\na purchase. This implies a relationship between reviews and\nstructred data like customer, sales and item tables. The\nreviews and its relationship with the structred data can be\ncaptured bya table/file. The table/file capturesthe primary\nkeys of the referenced tables. The review itself is containedin a large variable character field containing free form text,\nthe rating score and the date and time of the review are also\ncontained in the table/file.\n3.1.2 Volume\nThe size of the structured area is based on the size of the\ntables involved, using a well-understood and known quan-\ntity similar to the scale factor in TPC-DS. 
The size of the\nsemi-structured and unstructured areas are also based on\nthis scale factor. Consequently, the size of the complete\nBigBench data set is based on a single scale factor and is\npredictable and deterministic at any volume.\nFor the item marketprice table, it is assumed that an av-\ner\nage of 5 competitor prices are stored for each item. Thus,\nthe sizing of item marketprice is |it em|×5.\nThe size of web logs dependson the number of clicks made\nby buyers (making entries in web sales) and visitors who do\nno\nt endupbuying. Each rowin web sales represents a single\nli\nne item, thus the number of clicks per sale is comprised of\nthe number of clicks per item and the number of clicks to\nmake a sale (i.e. login, go to cart, checkout). The number\nof clicks for buyers cbcan be specified with the following\nequation:\ncb=|websales|×( pages per item+pages per buy\nitems per sale)\nAs\nsumingbothpages per itemandpages per buytobe equal\nto 4 on average and setting the avergae value of items per\nsale to be 12 (from TPC-DS), the value of cbis simplified to\ncb=|websales|×4.3 3\nWe assume that 80% of surfers are visitors (20% buyers)\nwhich makes the ratio of visitors to buyers to be 4:1. We\nalso assume that on average visitors browse items the same\nway as buyers. Based on these assumptions, the formula for\nthe number of clicks for visitors cvis:\ncv= (|websales|×p ages per item) ×visitor ratio\ncv=|websales|×1 6\nOverall, the size of the web log is cb+cvand can be ex-\npressedasamultipleofthesizeofweb sales. Itisweb sales×\n20.3\n3. The web sales table scales linearly with the scale fac-\nto\nr, the size for scale factor 1 is 720K, thus the number of\nentries for the web log at scale factor 1 is 14,600K. Given to\nthe log format, the raw file size is 3 gigabyte.\nFor the review sizing, a similar approach is chosen. 
Three\nsources for reviews are considered: anonymous reviews, ran-\ndom item reviews by registered users (customers), and re-\nviews based on sales. The number of anonymous reviews is\nrelated to the number of items, an average of 5 anonymous\nTable 1: Representation of a web log entry\nField Name Value\nwcsclicksk 996146\nwcsclickdatesk2452814\nwcsclicktimesk21563\nwcsitemsk 28\nwcswebpagesk 32\nwcsusersk 95789\n1200\n\n[Página 5]\nreviews per item is assumed. The number of reviews by\nre\ngistered users is dependent on the number of users in the\nsystem. Because not all users are actually writing reviews,\nan average of one review per 5 users is assumed. Finally,\na certain amount of the sales will directly lead to a review.\nThis amount is set to 15%. The number of reviews can be\ncomputed by the following formula:\n|reviews|=|items|×5+|customers |×0.2+|websales|×0.1 5\n3.1.3 Velocity\nVelocity, i.e. a periodic data refresh process, is an inte-\ngral part of the life cycle of a big data system. A production\ndata refresh process consists of three steps: (i) data extract\n(ii)datatransformation, and(iii)dataload. Inaproduction\nsystem environment, the data extraction step may consist of\nnumerousseparateextractoperations, executedagainst mul-\ntiple operational systems and ancillary data sources. As it\nis unlikely that the full list of these operational data sources\nresides on the system running the big data application, it\nis doubtful the measurement of the data extraction perfor-\nmance would result in a metric appropriate or meaningful\nto the scope of this benchmark. In light of this, the data\nextract step is assumed and represented in the benchmark\nin the form of generated files.\nThere are two aspects to discuss in a periodic refresh\nmodel for the tables in BigBench: (i) amount of data to\ninclude in the refresh process and (ii) the time interval at\nwhich the refresh occurs. 
Both aspects apply to the struc-\ntured(websales channeland itemmarketprice ta bles), semi-\nstructured ( clickstream) and un-structured data ( p roduct\nreview).\nW\ne implement BigBench’s periodic refresh process based\non the well studied methodology for data maintenance of\nTPC-DS. It defines the insertion of new data and the dele-\ntion of old data from all fact tables as well as insert and\nupdated data of dimensions. Dimensions are divided into\nthree sets, history keeping, non-history keeping and static\ndimensions. Static dimensions, such as date and time are\nnot updated. History keeping dimensions never overwrite\nany data, but they keep a history of all former changes.\nNon-History keeping dimensions resemble almost a one-to-\none copy of the table in the operational system of the busi-\nness, i.e. they update existing data. Both, history keeping\nand non-history keeping dimensions, accept new data and\nnever delete any old data. According to the above defini-\ntions,clickstreamandp roductrevieware fact tables and\ni\ntemmarketprice is a history keeping table. Pseudo code\nfor the insertion, deletion of fact table data as well as insert\nand update operations for the dimension tables can be found\nin [23] and the official TPC-DS specification3.\nOne of the fundamental aspects of the above methodol-\nogy is the concurrent execution of the refresh process with\nthe query workload. Queries must be interspersed with in-\nsert, delete and update operations. In BigBench we run\nN concurrent query streams containing queries against the\nstructured, semi-structuredandunstructuredportionsofthe\nschema. The numberof refresh processes executedis a linear\nfunction of the number of query streams, S. In real systems,\ndata against the different data portions is updated with dif-\nferent frequencies. 
Hence we define a vector V with the\nfollowing three separate data refresh velocities for each of\n3TPC –http://www.tpc.org/g89/g437/g286/g396/g349/g286/g400/g3/g94/g410/g396/g286/g258/g373/g3/g1005/g1007/g910\n/g94/g3/g395/g437/g286/g396/g349/g286/g400 /g89/g282/g258/g410/g258/g410/g455/g393/g286/g400/g882/g1010/g3/g395/g437/g286/g396/g349/g286/g400 /g89/g282/g258/g410/g258/g410/g455/g393/g286/g400/g882/g1010/g3/g395/g437/g286/g396/g349/g286/g400\n/g89/g437/g286/g396/g349/g286/g400/g3/g94/g410/g396/g286/g258/g373/g3/g1006\n/g89/g437/g286/g396/g349/g286/g400/g3/g94/g410/g396/g286/g258/g373/g3/g94/g882/g1005\n/g89/g437/g286/g396/g349/g286/g400/g3/g94/g410/g396/g286/g258/g373/g3/g94/g857\n/g89/g437/g286/g396/g349/g286/g400/g3/g94/g410/g396/g286/g258/g373/g3/g94\n/g90/g286/g296/g396/g286/g400/g346/g3/g87/g396/g381/g272/g286/g400/g400/g3/g1005 /g90/g286/g296/g396/g286/g400/g346/g3/g87/g396/g381/g272/g286/g400/g400/g3/g1006 /g90/g286/g296/g396/g286/g400/g346/g3/g87/g396/g381/g272/g286/g400/g400/g3/g1007 /g90/g286/g296/g396/g286/g400/g346/g3/g87/g396/g381/g272/g286/g400/g400/g3/g94/g876/g1006\nFigure 3: Scheduling of refresh processes based on\nex\necuted queries per data type\nthe different data portions, V= (Vstructured ,Vsemistructured\nandVunstructured ). We suggest the following values for V,\nwhich are subject to change as we run more experiments.\nThe structured data being the least frequently updated por-\ntion of the schema has a velocity of Vstructured = 1, i.e. S\nrefresh process. The unstructured data gets a velocity of\nVunstructured = 2∗Vstructured , i.e. 2∗Srefresh process, and\nthe semi-structured data being the most frequently updated\nportion gets a velocity of Vsemistructured = 2∗Vunstructured ,\ni.e. 4∗Srefresh process. 
The total number of refresh pro-\ncesses is 7 ∗S.\nDuring a BigBench run the following two requirements\nguarantee that the queries are interspersed with the queries\n(Sis the total number of query streams and Qdatatypeis\nthe total number of queries against the three portions of the\nschema ):\n1. The Nthrefresh set canonly startafter [((3 ∗S)+((N−\n1)∗2∗Qdatatype)] queries have completed (aggregated\nover all streams), and\n2. The [(3 ∗S)+(N∗(Qdatatype−6))+1]th query (ag-\ngregated over all streams) can only start after the Nth\nrefresh set has completed.\nThis means that at least (3 ∗S) queries must complete be-\nfore the first refresh set can start and at least Qdatatype−6\nadditional queries must complete before the second refresh\nsetcan start. Ingeneral atleast (3 ∗S)+((N−1)∗Qdatatype−\n6)) queries must complete before the Nth refresh set can\nstart. Figure 3 shows how the refresh processes are sched-\nuled depending on the number of executed queries.\nAll three type of data tables follow the well-understood\nscale factors of TPC-DS as outlined in the previous section.\nThat is the amount of data to be inserted in each ETL op-\neration is a percentage of the initial load, e.g. 0.1%.\n3.2 DataGeneration\nOur data generation design is based on an existing tech-\nnology called Parallel Data Generation Framework (PDGF).\nPDGF was designed to address structured data. Part of the\nwork presented in this paper is to extend the framework\nto produce the semi-structured and unstructured data. The\nsemi-structured data is generated in form of weblogs and the\nunstructured data in form of item reviews. In the following\nsection, we give an overview of PDGF and then elaborate\non its extensions for semi-structured and unstructured data.\n1201\n\n[Página 6]\n3.2.1 PDGF\nPD\nGF is a generic, parallel data generator which was de-\nveloped at the Universityof Passau [18, 29]. PDGF is imple-\nmented in Java and fully platform independent. 
Currently,\nPDGF is used to implement the default data generator for\nthe TPC’s new ETL benchmark TPC-DI [33]. PDGF’s gen-\neration approachexploits theinherentparallelism of xorshift\nrandom number generators by using a novel seeding strat-\negy. The seeding strategy hierarchically assigns seeds to the\ntables, columns and rows of a database schema and thus\nmakes it possible to generate data completely in parallel as\nwell as re-calculate any value in the database without ac-\ncessing the original data.\nOriginally, PDGF is designed to generate relational data.\nThe data is specified in two XML documents, the schema\nconfigurationandthegenerationconfiguration. Asthename\nsuggests, the schema configuration specifies the data simi-\nlar to the definition of a relational schema. The generation\nconfiguration makes it possible to specify additional post-\nprocessing of the generation. The post-processing includes\nformatting data, merging and splitting tables, as well as ad-\nvanced procedures by providing a script like programming\ninterface using the Javassist4library.\nPDGF can be used as isto generate the structured parts\nof the data model. As discussed above, the current Big-\nBench schema comprises three additional entities on top\nof the TPC-DS schema: the Item marketprice table, an\nap\nache-style web server log, and the online reviews. The\nItemmarketprice table is a regular table and can easily be\nge\nnerated using PDGF. In Listing 1, an excerpt of the spec-\nification of Item marketprice can be seen. The table is de-\nfin\ned in a way similar to the SQL definition language, with\nan additional specification of the generation rules. The sur-\nrogate key (imp sk) is, for example, generated with a ID\nge\nnerator. 
PDGF supports more complex generation spec-\nifications as can be seen in the case of the imp competitor\nfie\nld, this field is generated as a random string that is null\nwith a probability of 0.025%.\n\n${\nitem}*${avg_competitors_per_item}\n \n\n${Item_marketprice}\n\n \n \n[..]\n\n\n0.00025 \n\n20 \n \n\n \n[..]\n
\nListing 1: Excerpt of the Schema Definition for\nIt\nemmarketprice\nTh\ne web server log has a special formatting, an example\n4Javassist project homepage - http://www.csg.is.titech.\nac.jp/~chiba/javassist/is shown in Figure 2. To generate a realistic web log, we\nspecified a table in PDGF that has all required columns for\na web log entry and formated it using PDGF’s scripting ca-\npabilities. Below in Listing 2 an excerpt of the definition of\nthe web server log table can be seen. The excerpt shows the\ndefinition of the size of the web log, and the table defini-\ntion with two attributes. The sizing is computed according\nto the formula in Section 3.1.2, the specification of the pa-\nrameters of the formula is omitted. For the table itself only\ntwo attributes are shown: a surrogate key wcsclickskand\nt\nhe reference to the web page wcswebpagesk. This ref-\ne\nrence is null with a probability of 0 .00025. In Listing 3,\nthe formatting code for the web log can be seen. As shown\nin the listing, some of the values in the log are static. For\nexample the request IP address is always “127.0.0.1” while\nother values such as the time and date are extracted from\nthe table.\n\n($\n{sales} * (${pages_per_item} + (${pages_to_buy}\n/ ${items_per_cart})))\n+ (${sales} * ${buy_ratio} * ${pages_per_item})\n \n\n${Web_clickstreams}\n\n \n \n[..]\n\n\n0.00025 \n\n1 \n${web_page} \n \n\n \n[..]\n
\nListing 2: Excerpt of the web log specification\n\n
- DataForgeTest
+ SmartDataTest
You need to enable JavaScript to run this app.
diff --git a/frontend/src/components/HomePage.js b/frontend/src/components/HomePage.js
index 3bffea7..b270eb6 100644
--- a/frontend/src/components/HomePage.js
+++ b/frontend/src/components/HomePage.js
@@ -28,7 +28,7 @@ const DataQualityLLMSystem = () => {
navMethodology: 'Metodologia',
navChecklist: 'Checklist QA',
logout: 'Sair',
- heroTitle: 'DataForgeTest\nTestes de Qualidade para Big Data',
+ heroTitle: 'SmartDataTest\nTestes de Qualidade para Big Data',
heroSubtitle: 'Testes avançados de qualidade com métricas, suporte LLM + RAG e\ngeração automatizada de código PySpark',
btnChecklist: 'Checklist Support QA',
btnGenerate: 'Gerar Dataset',
@@ -38,7 +38,7 @@ const DataQualityLLMSystem = () => {
sectionProblems: 'Cenários de Qualidade de Dados',
sectionTips: 'Diretrizes de Implementação',
sectionFuture: 'Roadmap de Funcionalidades Futuras',
- footerCopyright: '© 2026 DataForgeTest. Todos os direitos reservados.',
+ footerCopyright: '© 2026 SmartDataTest. Todos os direitos reservados.',
footerRights: 'Plataforma de Automação de Qualidade de Dados para Big Data com LLM + RAG.',
footerBuiltWith: 'Desenvolvido com',
footerTech: 'React · Python · PySpark · LLM · RAG',
@@ -48,7 +48,7 @@ const DataQualityLLMSystem = () => {
navMethodology: 'Methodology',
navChecklist: 'QA Checklist',
logout: 'Logout',
- heroTitle: 'DataForgeTest\nBig Data Quality Testing',
+ heroTitle: 'SmartDataTest\nBig Data Quality Testing',
heroSubtitle: 'Advanced data quality testing with metrics, LLM + RAG support, and\nautomated PySpark code generation',
btnChecklist: 'Checklist Support QA',
btnGenerate: 'Generate Dataset',
@@ -58,7 +58,7 @@ const DataQualityLLMSystem = () => {
sectionProblems: 'Data Quality Scenarios',
sectionTips: 'Implementation Guidelines',
sectionFuture: 'Future Features Roadmap',
- footerCopyright: '© 2026 DataForgeTest. All rights reserved.',
+ footerCopyright: '© 2026 SmartDataTest. All rights reserved.',
footerRights: 'Data Quality Automation Platform for Big Data with LLM + RAG.',
footerBuiltWith: 'Built with',
footerTech: 'React · Python · PySpark · LLM · RAG',
@@ -76,7 +76,7 @@ const DataQualityLLMSystem = () => {
⚡
- DataForgeTest
+ SmartDataTest
@@ -123,7 +123,7 @@ const DataQualityLLMSystem = () => {
- DataForgeTest
+ SmartDataTest
·
{t.footerCopyright}
@@ -741,7 +741,7 @@ const DataQualityLLMSystem = () => {
{t.sectionFuture}
- Innovative features planned to enhance your DataForgeTest platform
+ Innovative features planned to enhance your SmartDataTest platform
{/* Feature Navigation */}
diff --git a/frontend/src/context/LanguageContext.js b/frontend/src/context/LanguageContext.js
index 7289c97..5d5bc1e 100644
--- a/frontend/src/context/LanguageContext.js
+++ b/frontend/src/context/LanguageContext.js
@@ -1,6 +1,6 @@
import React, { createContext, useContext, useState } from 'react';
-const LANG_KEY = 'dataforgetest_language';
+const LANG_KEY = 'smartdatatest_language';
const LanguageContext = createContext(null);
diff --git a/frontend/src/data/users.js b/frontend/src/data/users.js
index bb5b4b4..4282dc5 100644
--- a/frontend/src/data/users.js
+++ b/frontend/src/data/users.js
@@ -13,7 +13,7 @@ export const REGISTERED_USERS = [
{
id: 'user-admin-001',
name: 'Admin DataForge',
- email: 'admin@dataforgetest.com',
+ email: 'admin@smartdatatest.com',
// Plain password stored only for frontend demo — migrate to backend auth
password: 'admin123',
role: 'admin',
@@ -23,7 +23,7 @@ export const REGISTERED_USERS = [
{
id: 'user-eng-002',
name: 'Engineer DataForge',
- email: 'engineer@dataforgetest.com',
+ email: 'engineer@smartdatatest.com',
password: 'engineer123',
role: 'data_eng',
avatar: null,
@@ -32,7 +32,7 @@ export const REGISTERED_USERS = [
{
id: 'user-qa-003',
name: 'QA DataForge',
- email: 'qa@dataforgetest.com',
+ email: 'qa@smartdatatest.com',
password: 'qa123456',
role: 'tester',
avatar: null,
diff --git a/frontend/src/pages/DataAccuracy.js b/frontend/src/pages/DataAccuracy.js
index ec7a21f..ed935e2 100644
--- a/frontend/src/pages/DataAccuracy.js
+++ b/frontend/src/pages/DataAccuracy.js
@@ -33,7 +33,7 @@ const DataAccuracy = () => {
// Focus on page load
useEffect(() => {
- document.title = 'Acurácia de Dados - DataForgeTest';
+ document.title = 'Acurácia de Dados - SmartDataTest';
}, []);
const handleGoldFileSelect = async (file) => {
diff --git a/frontend/src/pages/DatasetMetrics.js b/frontend/src/pages/DatasetMetrics.js
index 72c9cd3..7db5166 100644
--- a/frontend/src/pages/DatasetMetrics.js
+++ b/frontend/src/pages/DatasetMetrics.js
@@ -25,7 +25,7 @@ const DatasetMetrics = () => {
// Focus management
useEffect(() => {
- document.title = 'Dataset Metrics - DataForgeTest';
+ document.title = 'Dataset Metrics - SmartDataTest';
}, []);
// Handle file selection
diff --git a/frontend/src/pages/LoginPage.js b/frontend/src/pages/LoginPage.js
index 1702288..1cff489 100644
--- a/frontend/src/pages/LoginPage.js
+++ b/frontend/src/pages/LoginPage.js
@@ -43,7 +43,7 @@ import {
// ---------------------------------------------------------------------------
const translations = {
'pt-BR': {
- platformName: 'DataForgeTest',
+ platformName: 'SmartDataTest',
loginTitle: 'Bem-vindo de volta',
loginSubtitle: 'Faça login para acessar a plataforma de QA em Big Data',
emailLabel: 'E-mail',
@@ -53,9 +53,9 @@ const translations = {
loginButton: 'Entrar',
loginButtonLoading: 'Autenticando...',
demoCredentials: 'Credenciais de demonstração',
- demoAdmin: 'Admin: admin@dataforgetest.com / admin123',
- demoEngineer: 'Engenheiro: engineer@dataforgetest.com / engineer123',
- demoQa: 'QA: qa@dataforgetest.com / qa123456',
+ demoAdmin: 'Admin: admin@smartdatatest.com / admin123',
+ demoEngineer: 'Engenheiro: engineer@smartdatatest.com / engineer123',
+ demoQa: 'QA: qa@smartdatatest.com / qa123456',
profileTitle: 'Quase lá!',
profileSubtitle: 'Personalize sua experiência na plataforma',
profileQuestion: 'Qual é o seu perfil profissional?',
@@ -80,14 +80,14 @@ const translations = {
coverage: 'Cobertura',
response: 'Resposta',
},
- footerCopyright: '© 2026 DataForgeTest. Todos os direitos reservados.',
+ footerCopyright: '© 2026 SmartDataTest. Todos os direitos reservados.',
footerRights: 'Plataforma de qualidade de dados com suporte de IA — Uso educacional e profissional.',
footerBuiltWith: 'Desenvolvido com',
footerTech: 'React + Flask + Python 3.12',
loading: 'Carregando...',
},
'en-US': {
- platformName: 'DataForgeTest',
+ platformName: 'SmartDataTest',
loginTitle: 'Welcome back',
loginSubtitle: 'Sign in to access the Big Data QA platform',
emailLabel: 'Email',
@@ -97,9 +97,9 @@ const translations = {
loginButton: 'Sign In',
loginButtonLoading: 'Authenticating...',
demoCredentials: 'Demo credentials',
- demoAdmin: 'Admin: admin@dataforgetest.com / admin123',
- demoEngineer: 'Engineer: engineer@dataforgetest.com / engineer123',
- demoQa: 'QA: qa@dataforgetest.com / qa123456',
+ demoAdmin: 'Admin: admin@smartdatatest.com / admin123',
+ demoEngineer: 'Engineer: engineer@smartdatatest.com / engineer123',
+ demoQa: 'QA: qa@smartdatatest.com / qa123456',
profileTitle: 'Almost there!',
profileSubtitle: 'Personalize your platform experience',
profileQuestion: 'What is your professional profile?',
@@ -124,7 +124,7 @@ const translations = {
coverage: 'Coverage',
response: 'Response',
},
- footerCopyright: '© 2026 DataForgeTest. All rights reserved.',
+ footerCopyright: '© 2026 SmartDataTest. All rights reserved.',
footerRights: 'AI-powered data quality platform — Educational and professional use.',
footerBuiltWith: 'Built with',
footerTech: 'React + Flask + Python 3.12',
@@ -212,7 +212,7 @@ function TopBar() {
⚡
- DataForgeTest
+ SmartDataTest
diff --git a/frontend/src/pages/SupportPage.js b/frontend/src/pages/SupportPage.js
index f07b4d2..9ebeb90 100644
--- a/frontend/src/pages/SupportPage.js
+++ b/frontend/src/pages/SupportPage.js
@@ -21,7 +21,7 @@ const SupportPage = () => {
variants={slideIn}
className="text-4xl md:text-5xl font-bold mb-6 bg-clip-text text-transparent bg-gradient-to-r from-purple-400 to-pink-600"
>
- DataForgeTest Support
+ SmartDataTest Support
{
// Focus management
useEffect(() => {
- document.title = 'Test Dataset GOLD - DataForgeTest';
+ document.title = 'Test Dataset GOLD - SmartDataTest';
}, []);
// Polling for status
diff --git a/frontend/src/utils/authStorage.js b/frontend/src/utils/authStorage.js
index 83b5078..53a8456 100644
--- a/frontend/src/utils/authStorage.js
+++ b/frontend/src/utils/authStorage.js
@@ -1,12 +1,12 @@
/**
* Auth storage utilities.
*
- * SESSION_KEY: 'dataforgetest_session'
+ * SESSION_KEY: 'smartdatatest_session'
* Stores: {userId, name, email, role, avatar, profile, loginAt, expiresAt}
* ⚠️ passwordHash is NEVER stored.
*/
-export const SESSION_KEY = 'dataforgetest_session';
+export const SESSION_KEY = 'smartdatatest_session';
/**
* Save user session to localStorage.
diff --git a/tests/frontend/README.md b/tests/frontend/README.md
index a83eba6..f1548d8 100644
--- a/tests/frontend/README.md
+++ b/tests/frontend/README.md
@@ -1,6 +1,6 @@
# Frontend Tests
-Este diretório contém os testes de frontend do DataForgeTest, organizados por tipo.
+Este diretório contém os testes de frontend do SmartDataTest, organizados por tipo.
## Estrutura
diff --git a/tests/frontend/integration/RAGIntegration.test.js b/tests/frontend/integration/RAGIntegration.test.js
index 16d694c..ab8a57e 100644
--- a/tests/frontend/integration/RAGIntegration.test.js
+++ b/tests/frontend/integration/RAGIntegration.test.js
@@ -159,7 +159,7 @@ describe('RAG Integration Tests', () => {
render( );
// Verify page title
- expect(screen.getByText('DataForgeTest Support')).toBeInTheDocument();
+ expect(screen.getByText('SmartDataTest Support')).toBeInTheDocument();
// Verify page description
expect(screen.getByText(/Get help with your data quality testing setup using our AI-powered documentation assistant/)).toBeInTheDocument();
@@ -173,7 +173,7 @@ describe('RAG Integration Tests', () => {
render( );
// Verify the complete page structure is rendered
- expect(screen.getByText('DataForgeTest Support')).toBeInTheDocument();
+ expect(screen.getByText('SmartDataTest Support')).toBeInTheDocument();
expect(screen.getByTestId('chat-window')).toBeInTheDocument();
// Verify chat functionality components
diff --git a/tests/frontend/unit/DatasetMetrics.test.js b/tests/frontend/unit/DatasetMetrics.test.js
index ff07fad..7e1af20 100644
--- a/tests/frontend/unit/DatasetMetrics.test.js
+++ b/tests/frontend/unit/DatasetMetrics.test.js
@@ -373,7 +373,7 @@ describe('DatasetMetrics Component', () => {
describe('Accessibility', () => {
test('sets document title', () => {
renderWithRouter( );
- expect(document.title).toBe('Dataset Metrics - DataForgeTest');
+ expect(document.title).toBe('Dataset Metrics - SmartDataTest');
});
test('has proper button roles', () => {
diff --git a/tests/frontend/unit/LanguageContext.test.js b/tests/frontend/unit/LanguageContext.test.js
index 68e4738..89d017f 100644
--- a/tests/frontend/unit/LanguageContext.test.js
+++ b/tests/frontend/unit/LanguageContext.test.js
@@ -9,7 +9,7 @@ import '@testing-library/jest-dom';
import { LanguageProvider, useLanguage } from '../../../frontend/src/context/LanguageContext';
-const LANG_KEY = 'dataforgetest_language';
+const LANG_KEY = 'smartdatatest_language';
beforeEach(() => {
localStorage.clear();
diff --git a/tests/frontend/unit/LoginPage.profile.test.js b/tests/frontend/unit/LoginPage.profile.test.js
index c11f6ae..eed60fb 100644
--- a/tests/frontend/unit/LoginPage.profile.test.js
+++ b/tests/frontend/unit/LoginPage.profile.test.js
@@ -356,12 +356,12 @@ describe('LoginPage — Login form loading and error states', () => {
renderPage();
const emailInput = document.querySelector('input[type="email"]');
const passwordInput = document.querySelector('input[type="password"]');
- fireEvent.change(emailInput, { target: { value: 'admin@dataforgetest.com' } });
+ fireEvent.change(emailInput, { target: { value: 'admin@smartdatatest.com' } });
fireEvent.change(passwordInput, { target: { value: 'admin123' } });
const form = document.querySelector('form');
fireEvent.submit(form);
await waitFor(() => {
- expect(mockHandleLogin).toHaveBeenCalledWith('admin@dataforgetest.com', 'admin123', false);
+ expect(mockHandleLogin).toHaveBeenCalledWith('admin@smartdatatest.com', 'admin123', false);
});
await waitFor(() => {
expect(screen.getByText(/Quase lá/i)).toBeInTheDocument();
diff --git a/tests/frontend/unit/LoginPage.test.js b/tests/frontend/unit/LoginPage.test.js
index 9fce122..d9e39cb 100644
--- a/tests/frontend/unit/LoginPage.test.js
+++ b/tests/frontend/unit/LoginPage.test.js
@@ -136,7 +136,7 @@ describe('LoginPage — Step 1: Login Form', () => {
test('renders login title in PT-BR by default', () => {
renderLoginPage();
- const elements = screen.getAllByText(/DataForgeTest/i);
+ const elements = screen.getAllByText(/SmartDataTest/i);
expect(elements.length).toBeGreaterThan(0);
});
diff --git a/tests/frontend/unit/SupportPage.test.js b/tests/frontend/unit/SupportPage.test.js
index 00e1a56..5f52273 100644
--- a/tests/frontend/unit/SupportPage.test.js
+++ b/tests/frontend/unit/SupportPage.test.js
@@ -37,7 +37,7 @@ describe('SupportPage Integration Tests', () => {
test('renders SupportPage with title and description', () => {
render( );
- expect(screen.getByText(/DataForgeTest Support/i)).toBeInTheDocument();
+ expect(screen.getByText(/SmartDataTest Support/i)).toBeInTheDocument();
expect(screen.getByText(/Get help with your data quality testing setup using our AI-powered documentation assistant/i)).toBeInTheDocument();
});
@@ -77,6 +77,6 @@ describe('SupportPage Integration Tests', () => {
expect(screen.getByTestId('message-circle-icon')).toBeInTheDocument();
// Check title is present
- expect(screen.getByText(/DataForgeTest Support/i)).toBeInTheDocument();
+ expect(screen.getByText(/SmartDataTest Support/i)).toBeInTheDocument();
});
});
diff --git a/tests/frontend/unit/TestDatasetGold.test.js b/tests/frontend/unit/TestDatasetGold.test.js
index 781a53c..a64e891 100644
--- a/tests/frontend/unit/TestDatasetGold.test.js
+++ b/tests/frontend/unit/TestDatasetGold.test.js
@@ -80,7 +80,7 @@ describe('TestDatasetGold Component', () => {
test('sets document title', () => {
renderWithRouter( );
- expect(document.title).toBe('Test Dataset GOLD - DataForgeTest');
+ expect(document.title).toBe('Test Dataset GOLD - SmartDataTest');
});
test('has proper navigation structure', () => {
diff --git a/tests/frontend/unit/authStorage.test.js b/tests/frontend/unit/authStorage.test.js
index 146576d..5c391c7 100644
--- a/tests/frontend/unit/authStorage.test.js
+++ b/tests/frontend/unit/authStorage.test.js
@@ -11,7 +11,7 @@ import {
hasProfile,
} from '../../../frontend/src/utils/authStorage';
-const SESSION_KEY = 'dataforgetest_session';
+const SESSION_KEY = 'smartdatatest_session';
const mockUser = {
id: 'user-1',
diff --git a/tests/frontend/unit/useAuth.test.js b/tests/frontend/unit/useAuth.test.js
index 803c838..6eac4ed 100644
--- a/tests/frontend/unit/useAuth.test.js
+++ b/tests/frontend/unit/useAuth.test.js
@@ -18,7 +18,7 @@ jest.mock('../../../frontend/src/data/users', () => ({
{
id: 'user-1',
name: 'Admin User',
- email: 'admin@dataforgetest.com',
+ email: 'admin@smartdatatest.com',
password: 'admin123',
role: 'admin',
avatar: null,
@@ -68,7 +68,7 @@ describe('useAuth', () => {
const { result } = renderHook(() => useAuth());
let loginResult;
await act(async () => {
- const promise = result.current.handleLogin('admin@dataforgetest.com', 'admin123', false);
+ const promise = result.current.handleLogin('admin@smartdatatest.com', 'admin123', false);
jest.advanceTimersByTime(1200);
loginResult = await promise;
});
@@ -93,7 +93,7 @@ describe('useAuth', () => {
const { result } = renderHook(() => useAuth());
let loginResult;
await act(async () => {
- const promise = result.current.handleLogin('admin@dataforgetest.com', 'wrongpass', false);
+ const promise = result.current.handleLogin('admin@smartdatatest.com', 'wrongpass', false);
jest.advanceTimersByTime(1200);
loginResult = await promise;
});
diff --git a/tests/frontend/unit/users.test.js b/tests/frontend/unit/users.test.js
index 6f9c6c2..2f15697 100644
--- a/tests/frontend/unit/users.test.js
+++ b/tests/frontend/unit/users.test.js
@@ -36,21 +36,21 @@ describe('REGISTERED_USERS', () => {
});
test('admin user exists with correct role', () => {
- const admin = REGISTERED_USERS.find((u) => u.email === 'admin@dataforgetest.com');
+ const admin = REGISTERED_USERS.find((u) => u.email === 'admin@smartdatatest.com');
expect(admin).toBeDefined();
expect(admin.role).toBe('admin');
expect(admin.password).toBe('admin123');
});
test('engineer user exists with correct role', () => {
- const eng = REGISTERED_USERS.find((u) => u.email === 'engineer@dataforgetest.com');
+ const eng = REGISTERED_USERS.find((u) => u.email === 'engineer@smartdatatest.com');
expect(eng).toBeDefined();
expect(eng.role).toBe('data_eng');
expect(eng.password).toBe('engineer123');
});
test('qa user exists with correct role', () => {
- const qa = REGISTERED_USERS.find((u) => u.email === 'qa@dataforgetest.com');
+ const qa = REGISTERED_USERS.find((u) => u.email === 'qa@smartdatatest.com');
expect(qa).toBeDefined();
expect(qa.role).toBe('tester');
expect(qa.password).toBe('qa123456');
From edd75ea59da8ffadf6c866896cfd1cf4bad20e24 Mon Sep 17 00:00:00 2001
From: Icar0S
Date: Thu, 19 Mar 2026 23:53:57 -0300
Subject: [PATCH 17/17] refactor: update terminology from DSL to JSON in
components and tests
---
.../src/pages/AdvancedPySparkGenerator.js | 14 +++---
frontend/src/pages/QaChecklist.js | 8 ++--
.../unit/AdvancedPySparkGenerator.test.js | 48 +++++++++----------
tests/frontend/unit/QaChecklist.test.js | 10 ++--
4 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/frontend/src/pages/AdvancedPySparkGenerator.js b/frontend/src/pages/AdvancedPySparkGenerator.js
index 3ed3dd5..51c1727 100644
--- a/frontend/src/pages/AdvancedPySparkGenerator.js
+++ b/frontend/src/pages/AdvancedPySparkGenerator.js
@@ -116,7 +116,7 @@ const AdvancedPySparkGenerator = () => {
});
if (!response.ok) {
- let errorMessage = 'Failed to generate DSL';
+ let errorMessage = 'Failed to generate JSON';
try {
const errorData = await response.json();
errorMessage = errorData.error || errorMessage;
@@ -153,7 +153,7 @@ const AdvancedPySparkGenerator = () => {
try {
finalDsl = JSON.parse(dslText);
} catch (e) {
- throw new Error('Invalid DSL JSON: ' + e.message);
+ throw new Error('Invalid JSON: ' + e.message);
}
}
@@ -508,10 +508,10 @@ const AdvancedPySparkGenerator = () => {
className="flex-1 px-6 py-3 bg-gradient-to-r from-purple-600 to-pink-600 text-white rounded-xl font-semibold disabled:opacity-50 disabled:cursor-not-allowed hover:shadow-lg hover:shadow-purple-500/30 transition-all duration-300 flex items-center justify-center gap-2"
>
{isLoading ? (
- <>Generating DSL...>
+ <>Generating JSON...>
) : (
<>
- Generate DSL
+ Generate JSON
>
)}
@@ -525,12 +525,12 @@ const AdvancedPySparkGenerator = () => {
- Step 3: Review and Edit DSL
+ Step 3: Review and Edit JSON
- Review the generated Data Specification Language (DSL). You can edit it directly if needed.
+ Review the generated JSON. You can edit it directly if needed.
@@ -664,7 +664,7 @@ const AdvancedPySparkGenerator = () => {
{step === 1 && 'Upload'}
{step === 2 && 'Review'}
- {step === 3 && 'DSL'}
+ {step === 3 && 'JSON'}
{step === 4 && 'Code'}
diff --git a/frontend/src/pages/QaChecklist.js b/frontend/src/pages/QaChecklist.js
index 0435eb1..9d9e7d5 100644
--- a/frontend/src/pages/QaChecklist.js
+++ b/frontend/src/pages/QaChecklist.js
@@ -128,7 +128,7 @@ const QaChecklist = () => {
});
if (!response.ok) {
- throw new Error('Failed to generate DSL and PySpark code');
+ throw new Error('Failed to generate JSON and PySpark code');
}
const data = await response.json();
@@ -203,7 +203,7 @@ const QaChecklist = () => {
{/* Success message */}
-
DSL e código PySpark gerados com sucesso!
+
JSON e código PySpark gerados com sucesso!
{/* Error display */}
@@ -215,7 +215,7 @@ const QaChecklist = () => {
{/* DSL Section */}
-
DSL (Domain Specific Language)
+
JSON
{
) : (
<>
- Gerar DSL e PySpark
+ Gerar JSON e PySpark
>
)}
diff --git a/tests/frontend/unit/AdvancedPySparkGenerator.test.js b/tests/frontend/unit/AdvancedPySparkGenerator.test.js
index 83a67a0..404f08e 100644
--- a/tests/frontend/unit/AdvancedPySparkGenerator.test.js
+++ b/tests/frontend/unit/AdvancedPySparkGenerator.test.js
@@ -185,7 +185,7 @@ describe('AdvancedPySparkGenerator Component', () => {
expect(screen.getByText(/Step 1: Upload Dataset/i)).toBeInTheDocument();
});
- test('proceeds to step 3 after generate DSL', async () => {
+ test('proceeds to step 3 after generate JSON', async () => {
fetch.mockResolvedValueOnce({
ok: true,
json: async () => mockMetadata,
@@ -200,14 +200,14 @@ describe('AdvancedPySparkGenerator Component', () => {
fireEvent.change(input, { target: { files: [file] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
await waitFor(() => {
- expect(screen.getByText(/Step 3: Review and Edit DSL/i)).toBeInTheDocument();
+ expect(screen.getByText(/Step 3: Review and Edit JSON/i)).toBeInTheDocument();
});
});
});
- describe('Step 3 - DSL review', () => {
+ describe('Step 3 - JSON review', () => {
const goToStep3 = async () => {
fetch.mockResolvedValueOnce({
ok: true,
@@ -223,13 +223,13 @@ describe('AdvancedPySparkGenerator Component', () => {
fireEvent.change(input, { target: { files: [file] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
};
- test('shows DSL editor in step 3', async () => {
+ test('shows JSON editor in step 3', async () => {
await goToStep3();
- expect(screen.getByText(/Step 3: Review and Edit DSL/i)).toBeInTheDocument();
+ expect(screen.getByText(/Step 3: Review and Edit JSON/i)).toBeInTheDocument();
});
test('back button returns to step 2 from step 3', async () => {
@@ -278,8 +278,8 @@ describe('AdvancedPySparkGenerator Component', () => {
fireEvent.change(input, { target: { files: [file] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
fireEvent.click(screen.getByRole('button', { name: /Generate PySpark Code/i }));
await waitFor(() => screen.getByText(/Step 4: PySpark Code/i));
};
@@ -424,8 +424,8 @@ describe('AdvancedPySparkGenerator - Metadata with columns', () => {
fireEvent.change(input, { target: { files: [file] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
fireEvent.click(screen.getByRole('button', { name: /Generate PySpark Code/i }));
await waitFor(() => screen.getByText(/Step 4: PySpark Code/i));
@@ -433,7 +433,7 @@ describe('AdvancedPySparkGenerator - Metadata with columns', () => {
});
});
-describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
+describe('AdvancedPySparkGenerator - JSON and Error Handling', () => {
beforeEach(() => {
fetch.mockClear();
jest.clearAllMocks();
@@ -447,9 +447,9 @@ describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
preview: [{ id: 1 }],
};
- test('shows error when DSL generation fails', async () => {
+ test('shows error when JSON generation fails', async () => {
fetch.mockResolvedValueOnce({ ok: true, json: async () => simpleMetadata });
- fetch.mockResolvedValueOnce({ ok: false, json: async () => ({ error: 'DSL generation failed' }) });
+ fetch.mockResolvedValueOnce({ ok: false, json: async () => ({ error: 'JSON generation failed' }) });
renderWithRouter( );
const input = document.querySelector('input[type="file"]');
@@ -457,9 +457,9 @@ describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
await waitFor(() => {
- expect(screen.queryByText(/Step 3: Review and Edit DSL/i)).not.toBeInTheDocument();
+ expect(screen.queryByText(/Step 3: Review and Edit JSON/i)).not.toBeInTheDocument();
});
});
@@ -473,15 +473,15 @@ describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
fireEvent.change(input, { target: { files: [new File(['content'], 'data.csv')] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
fireEvent.click(screen.getByRole('button', { name: /Generate PySpark Code/i }));
await waitFor(() => {
expect(screen.queryByText(/Step 4: PySpark Code/i)).not.toBeInTheDocument();
});
});
- test('allows editing DSL text in step 3', async () => {
+ test('allows editing JSON text in step 3', async () => {
fetch.mockResolvedValueOnce({ ok: true, json: async () => simpleMetadata });
fetch.mockResolvedValueOnce({ ok: true, json: async () => ({ dsl: { rules: [{ rule: 'test' }] } }) });
@@ -490,8 +490,8 @@ describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
fireEvent.change(input, { target: { files: [new File(['content'], 'data.csv')] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
const textarea = screen.getByRole('textbox');
fireEvent.change(textarea, { target: { value: '{"custom": "dsl"}' } });
@@ -508,8 +508,8 @@ describe('AdvancedPySparkGenerator - DSL and Error Handling', () => {
fireEvent.change(input, { target: { files: [new File(['content'], 'data.csv')] } });
fireEvent.click(screen.getByRole('button', { name: /Inspect Dataset/i }));
await waitFor(() => screen.getByText(/Step 2: Review Dataset Metadata/i));
- fireEvent.click(screen.getByRole('button', { name: /Generate DSL/i }));
- await waitFor(() => screen.getByText(/Step 3: Review and Edit DSL/i));
+ fireEvent.click(screen.getByRole('button', { name: /Generate JSON/i }));
+ await waitFor(() => screen.getByText(/Step 3: Review and Edit JSON/i));
fireEvent.click(screen.getByRole('button', { name: /Generate PySpark Code/i }));
await waitFor(() => screen.getByText(/Step 4: PySpark Code/i));
diff --git a/tests/frontend/unit/QaChecklist.test.js b/tests/frontend/unit/QaChecklist.test.js
index a437450..8f2a387 100644
--- a/tests/frontend/unit/QaChecklist.test.js
+++ b/tests/frontend/unit/QaChecklist.test.js
@@ -169,15 +169,15 @@ describe('QaChecklist Component', () => {
// Fill and submit
fireEvent.change(textarea, { target: { value: 'start_date:<:end_date' } });
- const submitButton = screen.getByRole('button', { name: /Gerar DSL e PySpark/i });
+ const submitButton = screen.getByRole('button', { name: /Gerar JSON e PySpark/i });
fireEvent.click(submitButton);
// Should show success message and results
await waitFor(() => {
- expect(screen.getByText(/DSL e código PySpark gerados com sucesso/)).toBeInTheDocument();
+ expect(screen.getByText(/JSON e código PySpark gerados com sucesso/)).toBeInTheDocument();
});
- expect(screen.getByText('DSL (Domain Specific Language)')).toBeInTheDocument();
+ expect(screen.getByText('JSON')).toBeInTheDocument();
expect(screen.getByText('Código PySpark')).toBeInTheDocument();
});
@@ -240,12 +240,12 @@ describe('QaChecklist Component', () => {
// Submit on last question
fireEvent.change(textarea, { target: { value: 'start_date:<:end_date' } });
- const submitButton = screen.getByRole('button', { name: /Gerar DSL e PySpark/i });
+ const submitButton = screen.getByRole('button', { name: /Gerar JSON e PySpark/i });
fireEvent.click(submitButton);
// Should show error
await waitFor(() => {
- expect(screen.getByText(/Failed to generate DSL and PySpark code/)).toBeInTheDocument();
+ expect(screen.getByText(/Failed to generate JSON and PySpark code/)).toBeInTheDocument();
});
});