diff --git a/.gitignore b/.gitignore index e298743..2a0bbf2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ venv/ # Ignore data files data/ +# EXCEPT: frontend source data (user definitions) +!frontend/src/data/ # Ignore output files output/ diff --git a/docs/AUTH.md b/docs/AUTH.md new file mode 100644 index 0000000..6a7751f --- /dev/null +++ b/docs/AUTH.md @@ -0,0 +1,79 @@ +# Auth + i18n — DataForgeTest + +## Autenticação (sem banco de dados) + +Fluxo: +``` +/login → useAuth.handleLogin() → compara com data/users.js → +authStorage.saveSession() → step='profile' → handleSaveProfile() → navigate('/') +``` + +### localStorage + +| Chave | Conteúdo | +|---|---| +| `dataforgetest_session` | `{userId, name, email, role, avatar, profile, loginAt, expiresAt}` | +| `dataforgetest_language` | `'pt-BR'` ou `'en-US'` | + +> ⚠️ **NUNCA** salvo: senha ou hash de senha + +### Expiração + +- Padrão: **8 horas** +- Com "Lembrar-me": **7 dias** + +--- + +## Migração para Backend (TODO) + +Em `useAuth.js`: trocar `REGISTERED_USERS` por `fetch('/api/auth/validate')`: + +```javascript +const res = await fetch(getApiUrl('/api/auth/validate'), { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ email, password }), +}); +const data = await res.json(); +``` + +Em `authStorage.js`: salvar JWT retornado +Em `ProtectedRoute.js`: validar JWT no header `Authorization` + +--- + +## Usuários Demo + +| E-mail | Senha | Role | +|---|---|---| +| admin@dataforgetest.com | admin123 | admin | +| engineer@dataforgetest.com | engineer123 | data_eng | +| qa@dataforgetest.com | qa123456 | tester | + +--- + +## i18n + +`LanguageContext` persiste a preferência de idioma em `'dataforgetest_language'`. + +Componente de toggle: `` — visual idêntico ao `MethodologyPage`. 
+ +Para usar em qualquer componente: + +```javascript +import { useLanguage } from '../context/LanguageContext'; +const { language, changeLanguage } = useLanguage(); +``` + +--- + +## Backend: `/api/auth/validate` + +| Método | Rota | Body | Resposta | +|---|---|---|---| +| POST | `/api/auth/validate` | `{email, password}` | `200 {valid: true, user: {...}}` | +| POST | `/api/auth/validate` | senha errada | `401 {valid: false, error: "..."}` | +| POST | `/api/auth/validate` | email inválido | `401 {valid: false, error: "..."}` | +| POST | `/api/auth/validate` | campos ausentes | `400 {valid: false, error: "..."}` | + +> Resposta nunca inclui `password_hash`. diff --git a/docs_to_import/mrs_oliveira2025/all_posts_mined.csv b/docs_to_import/mrs_oliveira2025/all_posts_mined.csv new file mode 100644 index 0000000..d717b18 --- /dev/null +++ b/docs_to_import/mrs_oliveira2025/all_posts_mined.csv @@ -0,0 +1,4091 @@ +Link +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 +https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 
+https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj 
+https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm 
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j 
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic 
+https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm 
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j 
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic 
+https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm 
+https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j 
+https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 
+https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic 
+https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 
+https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm +https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 
+https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j +https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk 
+https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 +https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf 
+https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic +https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a 
+https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm +https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest 
+https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j +https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 
+https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 +https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo +https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj 
+https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic +https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 +https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 
+https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm +https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest +https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf 
+https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j +https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e +https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection +https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data 
+https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python +https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut +https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data +https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter +https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i +https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming 
+https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution +https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data +https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets +https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high +https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler +https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations 
+https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view +https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 +https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse +https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data +https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design +https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data 
+https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python +https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c +https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo +https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data +https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python +https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient 
+https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data +https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into +https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set +https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga +https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net +https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection 
+https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data +https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python +https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut +https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data +https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter +https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i 
+https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming +https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution +https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data +https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets +https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high +https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler 
+https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations +https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view +https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 +https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse +https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data +https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design 
+https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data +https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python +https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c +https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo +https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data +https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python 
+https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient +https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data +https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into +https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set +https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga +https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net 
+https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection +https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data +https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python +https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut +https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data +https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter 
+https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i +https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming +https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution +https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data +https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets +https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high 
+https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler +https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations +https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view +https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 +https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse +https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data 
+https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design +https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data +https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python +https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c +https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo +https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data 
+https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python +https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient +https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data +https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into +https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set +https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga 
+https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net +https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection +https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data +https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python +https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut +https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data 
+https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter +https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i +https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming +https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution +https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data +https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets 
+https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high +https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler +https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations +https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view +https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 +https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse 
+https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data +https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design +https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data +https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python +https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c +https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo 
+https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data +https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python +https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient +https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data +https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into +https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set 
+https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga +https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net +https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection +https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data +https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python +https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut 
+https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data +https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter +https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i +https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming +https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution +https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data 
+https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets +https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high +https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler +https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations +https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view +https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 
+https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse +https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data +https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design +https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data +https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python +https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c 
+https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo +https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data +https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python +https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient +https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data +https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into 
+https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set +https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga +https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net +https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic 
+https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python +https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data +https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster +https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc +https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r +https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form 
+https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure +https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for +https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user +https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl +https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds +https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data 
+https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram +https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data +https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post +https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data +https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data +https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data 
+https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql +https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak +https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c +https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data +https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle +https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks 
+https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files +https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose +https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce +https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python +https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data +https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster 
+https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc +https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r +https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form +https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure +https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for +https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user 
+https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl +https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds +https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data +https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram +https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data +https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post 
+https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data +https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data +https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data +https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql +https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak +https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c 
+https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data +https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle +https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks +https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files +https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose +https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce 
+https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python +https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data +https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster +https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc +https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r 
+https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form +https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure +https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for +https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user +https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl +https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds 
+https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data +https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram +https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data +https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post +https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data +https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data 
+https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data +https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql +https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak +https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c +https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data +https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle 
+https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks +https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files +https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose +https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce +https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python +https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data 
+https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster +https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc +https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r +https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form +https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure +https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for 
+https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user +https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl +https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds +https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data +https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram +https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data 
+https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post +https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data +https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data +https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data +https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql +https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak 
+https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c +https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data +https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle +https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks +https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files +https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose 
+https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce +https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python +https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data +https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster +https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc 
+https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r +https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form +https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure +https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for +https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user +https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl 
+https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds +https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data +https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram +https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data +https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post +https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data 
+https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data +https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data +https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql +https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak +https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c +https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data 
+https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle +https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks +https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files +https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose +https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce +https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db 
+https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 
+https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 
+https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf 
+https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 +https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 +https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93 +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf 
+https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d 
+https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 
+https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 
+https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 +https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 
+https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 
+https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b 
+https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 +https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 
+https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a 
+https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 +https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 
+https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93 +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 
+https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b 
+https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 +https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 
+https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 
+https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 
+https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 
+https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 +https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e 
+https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 
+https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 
+https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 +https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93 +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb +https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e 
+https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 +https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 
+https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 +https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa +https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 
+https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 +https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 
+https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 
+https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 +https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 +https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93 +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck +https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data +https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id +https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb 
+https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t +https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss +https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data +https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data +https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set 
+https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame +https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter +https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data +https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck +https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data 
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id +https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb +https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t +https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss +https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data 
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data +https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set +https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame +https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter +https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data 
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck +https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data +https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id +https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb +https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t 
+https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss +https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data +https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data +https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set +https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame 
+https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter +https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data +https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck +https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data +https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id 
+https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb +https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t +https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss +https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data +https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data 
+https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set +https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame +https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter +https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data +https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck 
+https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data +https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id +https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb +https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t +https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss 
+https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data +https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data +https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set +https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame +https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter 
+https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data +https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark +https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file +https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data 
+https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data +https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference +https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin 
+https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery +https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements +https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data 
+https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns +https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native +https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of 
+https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves +https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark +https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file 
+https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data +https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data +https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference 
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin +https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery +https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements 
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data +https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns +https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native 
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of +https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves +https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark 
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file +https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data +https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data +https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference 
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin +https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery +https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements 
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data +https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns +https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native 
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of +https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves +https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark 
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file +https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data +https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data +https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference 
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin +https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery +https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements 
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data +https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns +https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native 
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of +https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves +https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark 
+https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file +https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data +https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data +https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference 
+https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin +https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery +https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements 
+https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data +https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns +https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native 
+https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of +https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves +https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra 
+https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ +https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- 
+https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy +https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e 
+https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla +https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye 
+https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot 
+https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality +https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r +https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data +https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino +https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust 
+https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a +https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am +https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table +https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter +https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse +https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment 
+https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data +https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications +https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data +https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r +https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data +https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino 
+https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust +https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a +https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am +https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table +https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter +https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse 
+https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment +https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data +https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications +https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data +https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r +https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data 
+https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino +https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust +https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a +https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am +https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table +https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter 
+https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse +https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment +https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data +https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications +https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data +https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r 
+https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data +https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino +https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust +https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a +https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am +https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table 
+https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter +https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse +https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment +https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data +https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications +https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data 
+https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r +https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data +https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino +https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust +https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a 
+https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am +https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table +https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter +https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse +https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment +https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data 
+https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications +https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data +https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored +https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models +https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client +https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same +https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift 
+https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram +https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text +https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz +https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize +https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file +https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame 
+https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data +https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data +https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project +https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different +https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications +https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file 
+https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri +https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api +https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift +https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented +https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data +https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python 
+https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand +https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files +https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key +https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices +https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow +https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db 
+https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e +https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored +https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models +https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client +https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same 
+https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift +https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram +https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text +https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz +https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize +https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file 
+https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame +https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data +https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data +https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project +https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different +https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications 
+https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file +https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri +https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api +https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift +https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented +https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data 
+https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python +https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand +https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files +https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key +https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices +https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow 
+https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db +https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e +https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored +https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models +https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client 
+https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same +https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift +https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram +https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text +https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz +https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize 
+https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file +https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame +https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data +https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data +https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project +https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different 
+https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications +https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file +https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri +https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api +https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift +https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented 
+https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data +https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python +https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand +https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files +https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key +https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices 
+https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow +https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db +https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e +https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored +https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models 
+https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client +https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same +https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift +https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram +https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text +https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz 
+https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize +https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file +https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame +https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data +https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data +https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project 
+https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different +https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications +https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file +https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri +https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api +https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift 
+https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented +https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data +https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python +https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand +https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files +https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key 
+https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices +https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow +https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db +https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e +https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored 
+https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models +https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client +https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same +https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift +https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram +https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text 
+https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz +https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize +https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file +https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame +https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data +https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data 
+https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project +https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different +https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications +https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file +https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri +https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api 
+https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift +https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented +https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data +https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python +https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand +https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files 
+https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key +https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices +https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow +https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db +https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e +https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app 
+https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting +https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures +https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app +https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting +https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures +https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app +https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting 
+https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures +https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app +https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting +https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures +https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app +https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting +https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures 
+https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://sqa.stackexchange.com/questions/37718/big-data-application-testing +https://sqa.stackexchange.com/questions/37718/big-data-application-testing +https://sqa.stackexchange.com/questions/37718/big-data-application-testing +https://sqa.stackexchange.com/questions/37718/big-data-application-testing +https://sqa.stackexchange.com/questions/37718/big-data-application-testing diff --git a/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv b/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv new file mode 100644 index 0000000..7b12d1d --- /dev/null +++ b/docs_to_import/mrs_oliveira2025/cleaned_all_posts_mined.csv @@ -0,0 +1,761 @@ +Link +https://dev.to/dataform/testing-data-quality-with-sql-assertions-248g +https://dev.to/finnauto/dbt-for-data-quality-testing-alerting-at-finn-5e7n +https://dev.to/aws-builders/data-quality-at-scale-with-great-expectations-spark-and-airflow-on-emr-5bnm +https://dev.to/ranjbaryshahab/improving-data-quality-in-clickhouse-databases-with-soda-4kp4 +https://dev.to/umaprasad/how-do-you-automate-big-data-testing-everything-to-know-3f90 +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp +https://dev.to/andyb1979/how-fast-is-scicharts-ios-chart-5hl1 +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22 +https://dev.to/oyedeletemitope/10-reasons-for-flaky-tests-5a63 +https://dev.to/supercokyle/why-data-quality-is-key-to-successful-ml-ops-ngk +https://dev.to/mbogan/five-data-quality-tools-you-should-know-5gkd +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730 +https://dev.to/taqkarim/self-grading-quizzes-with-airtable-3o5j +https://dev.to/voxel51/data-quality-the-hidden-driver-of-ai-success-2i63 +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo 
+https://dev.to/aws-heroes/how-to-check-for-quality-evaluate-data-with-aws-glue-data-quality-25nb +https://dev.to/grayhat/transform-your-data-like-a-pro-with-dbt-data-build-tool-39kd +https://dev.to/rdagumampan/run-environment-aware-database-migrations-with-yuniql-522l +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl +https://dev.to/kjpctech/efficient-iteration-of-big-data-in-django-354m +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006 +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a +https://dev.to/katalon/top-software-testing-trends-to-watch-out-for-in-2020-49fp +https://dev.to/andyb1979/scichart-is-the-fastest-js-chart-library-available-3o3c +https://dev.to/m1pko/data-quality-technical-debt-from-hell +https://dev.to/doriansabitov/optimizing-salesforce-data-integration-tools-and-best-practices-2g2i +https://dev.to/chris_bertrand/don-t-believe-the-hype-4ccb +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8 +https://dev.to/tom_millner/re-invent-2020-part-ii-data-sessions-reviewed-1n47 +https://dev.to/devsatasurion/death-of-the-coding-test-interview-methods-that-better-evaluate-candidate-competency-flj +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf +https://dev.to/lilian_kigunda_f7941bbc6a/the-rise-of-bioinformatics-clouds-a-new-era-for-big-data-in-life-sciences-nag +https://dev.to/documatic/data-privacy-laws-navigating-compliance-in-the-age-of-big-data-gic +https://dev.to/aws-builders/how-i-crushed-my-aws-certification-renewals-back-to-back-and-why-it-was-a-bad-idea-56fh +https://dev.to/namnguyen +https://dev.to/whokilledkevin/how-i-tried-a-new-tool-for-recruiting-letters-2gj +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5 
+https://dev.to/codexam/why-is-big-data-important-40ha +https://dev.to/anil_csimplifyit_905c/challenges-and-solutions-in-implementing-ai-for-software-testing-2533 +https://dev.to/rishabk7/passed-aws-solutions-architect-associate-5h2j +https://dev.to/cloudtech/starting-your-journey-with-big-data-analytics-5ceo +https://dev.to/potloc/data-analytics-at-potloc-i-making-data-integrity-your-priority-with-elementary-meltano-1ob +https://dev.to/adevintaspain/spark-unit-integration-and-end-to-end-tests-f52 +https://dev.to/jeremystan/airbnb-quality-data-for-all-280f +https://dev.to/mage_ai/understanding-dbt-data-build-tool-an-introduction-1e43 +https://dev.to/educative/what-is-big-data-characteristics-types-and-technologies-3op5?comments_sort=top +https://dev.to/nadawoud/the-amazing-skill-of-predicting-the-interview-5908 +https://dev.to/doriansabitov/how-to-simplify-large-salesforce-data-migration-52km +https://dev.to/_patrickgod/fetching-millions-of-rows-with-streams-in-node-js-487e +https://dev.to/daryashirokova +https://dev.to/supercokyle/your-data-tests-failed-now-what-4cl4 +https://dev.to/reneebetina +https://dev.to/balagmadhu/from-data-collection-to-model-deployment-key-deliverables-in-a-machine-learning-project-33c1 +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363 +https://dev.to/chixcancode/azure-back-to-school-2020-serverless-big-data-pipelines-data-storage-and-exploration-1m8a +https://dev.to/apssouza22/tech-lead-playbook-523 +https://dev.to/iskender83/securing-cloud-native-databases-and-big-data-solutions-2l56 +https://dev.to/mainulspace/big-data-storage-trends-and-insights-36gm +https://dev.to/pragyasapkota/lambda-architecture-revolutionizing-data-processing-for-big-data-253l?comments_sort=oldest 
+https://dev.to/lulu_liu_c90f973e2f954d7f/fortify-your-website-for-free-testing-the-power-of-safeline-bpm +https://dev.to/dataform +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin +https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii +https://dev.to/rootstack/tools-for-effective-dataops-implementation-5ce +https://dev.to/berthaw82414312 +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi +https://dev.to/tinybirdco +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm +https://dev.to/madgan95/introduction-to-big-data-analysis-4cg1 +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7 +https://dev.to/simonfrey/local-wordpress-plugin-development-with-docker-compose-nil +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i +https://dev.to/andyb1979/android-chart-performance-comparison-5ej7 +https://dev.to/habereder/comment/po6j +https://dev.to/bytebodger/litmus-tests-in-tech-1ll7 +https://dev.to/aestevezjimenez/gcp-professional-data-engineer-guide-september-2020-7lp +https://dev.to/donut87/one-assert-per-test---what-is-that-about-4l75 +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf +https://dev.to/ascendixtech/the-new-property-technology-how-big-data-disrupts-real-estate-5bbo?comments_sort=latest +https://dev.to/hughzurname/embracing-failure-a-journey-from-tester-to-tech-lead-3bo2 +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j +https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e 
+https://dev.to/dhrumitshukla/the-adaptive-big-data-layer-with-pentaho-data-integration-in-the-market--1f62 +https://dev.to/abdullah_haggag/building-a-big-data-playground-sandbox-for-learning-cgi +https://dev.to/contactsunny/installing-zsh-and-oh-my-zsh-on-windows-11-with-wsl2-1p5i +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db +https://dev.to/meghasharmaaaa/devops-toolchain-mlo +https://dev.to/documatic/top-6-php-code-quality-tools-2023-2kb1 +https://dev.to/t/testing/page/73 +https://dev.to/andyb1979/scichartjs-performance-demo-1-million-datapoints-in-under-15ms-50bd +https://dev.to/lubneuski_a1qa/full-cycle-testing-hands-on-tips-to-troubleshoot-qa-hurdles-1h7h +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm +https://dev.to/cwprogram/the-faults-of-algorithmic-coding-interview-tests-for-devops-4o49 +https://dev.to/itsukitatsuya11/cpp-vs-python-benchmark-testing-5a0p +https://dev.to/chaets/an-end-to-end-guide-to-dbt-data-build-tool-with-a-use-case-example-18mk +https://stackoverflow.com/questions/60900153/how-can-i-stream-big-data-to-google-cloud-storage +https://stackoverflow.com/questions/62267736/big-dataspark-sql-and-spark-dataframes-connection +https://stackoverflow.com/questions/64605008/language-detection-in-python-for-big-data +https://stackoverflow.com/questions/61174905/storing-big-data-on-a-mobile-device-ios-and-android-with-react-native-and-expo +https://stackoverflow.com/questions/64829534/how-to-improve-vectorized-sliding-window-for-big-data +https://stackoverflow.com/questions/63550138/efficient-way-to-send-big-data-between-main-process-and-renderer-process +https://stackoverflow.com/questions/60488810/what-are-the-best-practices-working-with-postgres-replication-slot-for-big-data +https://stackoverflow.com/questions/65342689/how-to-store-big-data-as-global-variables-in-dash-python 
+https://stackoverflow.com/questions/65033677/define-data-quality-rules-for-big-data +https://stackoverflow.com/questions/65458445/how-to-cache-big-data-in-memory-efficiently-in-complex-variables-across-execut +https://stackoverflow.com/questions/65418381/laravel-query-to-show-big-data-is-slow +https://stackoverflow.com/questions/65332910/how-to-plot-visualization-of-missing-values-for-big-data-in-r +https://stackoverflow.com/questions/65289092/python-mysql-insert-big-data +https://stackoverflow.com/questions/64531374/what-are-faster-ways-of-reading-big-data-set-and-apply-row-wise-operations-other +https://stackoverflow.com/questions/65225212/compute-time-difference-according-to-a-condition-and-for-big-data-with-pyspark +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter +https://stackoverflow.com/questions/63695750/logstash-jdbc-input-plugin-doesn-t-work-with-prepared-statements-enabled-and-w +https://stackoverflow.com/questions/64961961/shared-array-for-big-data +https://stackoverflow.com/questions/64805209/r-analyse-string-in-column-of-a-big-data-frame-and-give-value-in-a-separate-colu +https://stackoverflow.com/questions/63712214/pd-read-sav-and-pyreadstat-are-so-slow-how-can-i-speed-up-pandas-for-big-data-i +https://stackoverflow.com/questions/64572276/extract-columns-from-big-data-table-to-small-data-tables-and-save-in-a-list +https://stackoverflow.com/questions/64578127/chartjs-create-chart-with-big-data-and-fixed-labels +https://stackoverflow.com/questions/64413787/grpc-transfer-big-data-one-unary-call-is-slower-than-streaming +https://stackoverflow.com/questions/64476848/cogroupbykey-always-failed-on-big-data-pythonsdk +https://stackoverflow.com/questions/64475727/calculate-daily-mean-of-big-data-table-depending-on-calendar-year +https://stackoverflow.com/questions/64458754/string-agg-is-to-slow-with-big-data-and-i-need-a-faster-solution 
+https://stackoverflow.com/questions/64445194/pass-big-data-like-images-to-widget +https://stackoverflow.com/questions/64359172/any-way-to-do-this-query-faster-with-big-data +https://stackoverflow.com/questions/64336941/how-to-create-a-scatter-plot-of-a-really-big-data +https://stackoverflow.com/questions/64271351/iterating-through-big-data-with-pandas-large-and-small-dataframes +https://stackoverflow.com/questions/63774476/what-are-helpful-optimizations-in-r-for-big-data-sets +https://stackoverflow.com/questions/63484011/how-do-i-etl-big-data-between-2-sql-server +https://stackoverflow.com/questions/64014590/application-insights-with-big-data +https://stackoverflow.com/questions/63735023/how-to-simplify-text-comparison-for-big-data-set-where-text-meaning-is-same-but +https://stackoverflow.com/questions/63413805/ignite-write-big-data-in-a-pressure-test-io-write-and-read-time-tow-high +https://stackoverflow.com/questions/63390170/blazor-asynchronously-render-big-data +https://stackoverflow.com/questions/63378227/sqoop-big-data-how-to-import-an-address-field-with-a-comma-using-sqoop +https://stackoverflow.com/questions/61221081/random-forest-for-big-data +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler +https://stackoverflow.com/questions/63190729/realm-migration-with-big-data-base +https://stackoverflow.com/questions/63134926/regarding-nodejs-and-big-data +https://stackoverflow.com/questions/63126987/analyse-input-data-and-find-errors-in-input-in-big-data +https://stackoverflow.com/questions/63043467/how-to-fit-hierarchical-models-on-big-data-with-repeated-observations +https://stackoverflow.com/questions/62314917/sending-big-data-amount-to-google-cloud-iot-core +https://stackoverflow.com/questions/62969219/query-exceeded-resource-limits-in-bigquery-group-by-on-big-data +https://stackoverflow.com/questions/62566975/how-to-share-big-data-with-detail-view 
+https://stackoverflow.com/questions/62912231/bash-script-optimization-for-big-data +https://stackoverflow.com/questions/62906210/how-to-reduce-the-time-taken-working-on-a-big-data-frame +https://stackoverflow.com/questions/62873089/how-to-update-teradata-driver-in-talend-big-data-7-0 +https://stackoverflow.com/questions/62860410/cloud-firestore-big-data-error-deadline-exceeded +https://stackoverflow.com/questions/62849389/non-relational-database-design-for-big-data-warehouse +https://stackoverflow.com/questions/62855643/make-piece-of-code-efficient-for-big-data +https://stackoverflow.com/questions/62267686/database-restoration-problem-on-sql-server-big-data-cluster +https://stackoverflow.com/questions/62722717/how-to-get-some-subset-of-data-from-a-csv-file-for-big-datacomparing-csvs +https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data +https://stackoverflow.com/questions/62608168/how-to-rename-mongodb-columns-big-data +https://stackoverflow.com/questions/62427093/django-and-amazon-lambda-best-solution-for-big-data-with-amazon-rds-or-graphql +https://stackoverflow.com/questions/62393655/python-creating-big-data-base-with-arrays-and-dictionary +https://stackoverflow.com/questions/62296399/need-some-advice-on-big-data-etl-job-cost-effective-design +https://stackoverflow.com/questions/62285061/how-can-i-split-a-big-data-set-to-small-tables-in-sas +https://stackoverflow.com/questions/62262935/big-data-table-mysql-query-optimization +https://stackoverflow.com/questions/62138788/requesting-an-advice-on-big-data-validation +https://stackoverflow.com/questions/62078009/get-the-sum-of-all-occurences-in-json-api-big-data +https://stackoverflow.com/questions/62079366/php-cant-write-big-data-to-csv-file +https://stackoverflow.com/questions/61792486/substitute-for-nested-for-loops-in-pandas-dataframes-for-big-data-handling +https://stackoverflow.com/questions/61770600/read-big-data300gb-quickly-in-python 
+https://stackoverflow.com/questions/61888946/group-by-ids-sort-by-date-and-get-values-as-list-on-big-data-python +https://stackoverflow.com/questions/61759978/best-way-for-filtering-big-data-with-qt-c +https://stackoverflow.com/questions/61778494/big-data-query-mongodb-aggregation-single-index-or-compound-index +https://stackoverflow.com/questions/61683170/how-to-optimize-filter-for-big-data-volume-postgresql +https://stackoverflow.com/questions/61506168/return-big-data-using-pymongo +https://stackoverflow.com/questions/61398736/how-to-treat-wrong-historical-data-in-big-data +https://stackoverflow.com/questions/61359956/mongodb-aggregation-on-big-data-how-to-limit-push-in-group +https://stackoverflow.com/questions/61266998/sgdclassifier-on-big-data-sparse +https://stackoverflow.com/questions/60707971/integration-of-multiple-databases-via-talend-open-studio-for-big-data +https://stackoverflow.com/questions/60753240/problems-add-update-big-data-on-postgressql +https://stackoverflow.com/questions/61199694/how-export-big-data-1mln-to-excel-file-use-only-interop-excel +https://stackoverflow.com/questions/60921645/does-anyone-know-how-i-can-work-with-big-data-in-r +https://stackoverflow.com/questions/61115819/how-to-pivot-big-data-in-python +https://stackoverflow.com/questions/61112229/speeding-up-gaussian-elimination-php-code-for-big-data +https://stackoverflow.com/questions/61093059/how-to-avoid-increasing-ldf-while-transferring-big-data +https://stackoverflow.com/questions/60975276/php-and-jquery-ajax-batch-processing-big-data +https://stackoverflow.com/questions/60949933/oculus-quest-receive-big-data-from-tcpclient +https://stackoverflow.com/questions/60902411/fuzzy-name-matching-using-big-data-in-python +https://stackoverflow.com/questions/60737988/best-practice-with-big-data-table-using-r-shiny +https://stackoverflow.com/questions/60733045/using-eloquent-laravel-to-show-countrys-levels-with-big-data 
+https://stackoverflow.com/questions/60618718/archive-old-data-in-mysql-and-organize-big-data +https://stackoverflow.com/questions/60680685/is-bitset-the-right-container-to-manipulate-big-data-then-move-the-results-into +https://stackoverflow.com/questions/60632849/clean-trim-vba-errors-removed-filtered-data-leaves-na-does-not-work-on-big-d +https://stackoverflow.com/questions/60595399/how-to-parallelize-computation-on-big-data-dictionary-of-lists +https://stackoverflow.com/questions/60527098/how-to-find-30-most-frequent-values-in-big-data-set +https://stackoverflow.com/questions/60465031/how-to-read-certain-sets-of-lines-from-a-big-data-file-in-python +https://stackoverflow.com/questions/59824670/how-to-calculate-row-weighted-mean-of-big-data +https://stackoverflow.com/questions/60396495/need-to-replicate-data-from-oracle-12c-based-on-partition-using-oracle-golden-ga +https://stackoverflow.com/questions/60384558/big-data-conditional-agregration +https://stackoverflow.com/questions/60363512/how-setup-big-data-tools-plugin-for-intellij-idea-to-connect-aws-zeppeling-noteb +https://stackoverflow.com/questions/60306007/python-big-data-regression +https://stackoverflow.com/questions/60241630/whats-the-most-efficient-way-to-create-a-live-dashboard-for-big-data-using-net +https://stackoverflow.com/questions/60205278/xamarin-forms-how-to-handle-big-data-in-listview +https://stackoverflow.com/questions/60189960/how-to-handle-large-yet-not-big-data-datasets +https://softwareengineering.stackexchange.com/questions/418664/handle-big-data-sets-in-a-web-application-in-combination-with-real-time-communic +https://stackoverflow.com/questions/68028206/datomic-and-the-constant-transferring-of-big-data +https://stackoverflow.com/questions/66747730/how-to-write-a-big-data-frame-in-a-txt-file +https://stackoverflow.com/questions/68964914/dynamodb-importing-big-data-with-python 
+https://stackoverflow.com/questions/65655892/a-way-to-load-big-data-on-python-from-sftp-server-not-using-my-hard-disk +https://stackoverflow.com/questions/68601171/how-swiftui-tabview-page-handles-big-data +https://stackoverflow.com/questions/68612841/how-to-retrieve-big-data-logs-from-cloud-aws-services +https://stackoverflow.com/questions/68505571/about-google-colab-and-other-cloud-services-for-big-data-projects +https://stackoverflow.com/questions/66058732/synapse-analytics-vs-sql-server-2019-big-data-cluster +https://stackoverflow.com/questions/66947369/how-to-efficiently-handle-big-data-in-r-for-text-mining +https://stackoverflow.com/questions/68689165/salesforce-object-describe-has-big-data-how-to-get-limited-data-like-picklist-v +https://stackoverflow.com/questions/70432346/efficient-way-to-get-the-average-of-past-x-events-within-d-days-per-each-row-in +https://stackoverflow.com/questions/70490301/laracsv-export-error-because-of-big-data +https://stackoverflow.com/questions/70478173/how-to-track-the-big-data-stored-in-gdrive-through-dvc +https://stackoverflow.com/questions/70436840/section-list-load-issue-and-scrolltolocation-issue-for-big-data-react-native +https://stackoverflow.com/questions/70422270/what-is-the-best-way-to-read-big-data-and-pd-concat +https://stackoverflow.com/questions/70396206/big-data-ways-to-calculate-sets-of-distances-in-r +https://stackoverflow.com/questions/70261850/speed-up-the-processing-time-of-for-loop-for-big-data-in-r +https://stackoverflow.com/questions/70006322/how-to-resample-downsample-the-time-series-big-data-from-10-hz-miliseconds +https://stackoverflow.com/questions/70173183/how-can-i-binding-big-data-from-vuex-with-form +https://stackoverflow.com/questions/70102671/how-to-read-a-big-data-in-c +https://stackoverflow.com/questions/69849446/why-the-nodejs-heap-out-of-memory-for-creating-excel-file-with-big-data +https://stackoverflow.com/questions/69758458/big-data-structure 
+https://stackoverflow.com/questions/69787453/big-data-analytics-using-spark +https://stackoverflow.com/questions/69755570/applying-paired-euclidean-distance-between-all-columns-between-two-matrices-for +https://stackoverflow.com/questions/69724988/javascript-performance-issue-with-big-data +https://stackoverflow.com/questions/69629598/use-redux-persist-instead-of-local-db-for-big-data-react-native +https://stackoverflow.com/questions/69609348/what-is-the-best-way-to-store-big-data-per-user +https://stackoverflow.com/questions/69462749/cant-transform-big-data-in-ms-ssis-with-0xc0047048-error-and-nothing-helps +https://stackoverflow.com/questions/69519352/how-to-replace-a-specific-sequence-of-numbers-per-row-with-another-sequence-in +https://stackoverflow.com/questions/69479475/how-to-send-big-data-to-api-in-laravel +https://stackoverflow.com/questions/69482046/store-big-data-with-best-searching-time +https://stackoverflow.com/questions/69348268/how-to-fasten-scatterplot-of-seaborn-when-there-is-a-big-datamany-points-to-pl +https://stackoverflow.com/questions/69356128/how-to-make-big-data-smarter-and-more-useful-through-semantic-web-approach-owl +https://stackoverflow.com/questions/69284626/big-data-manipulations-with-python +https://stackoverflow.com/questions/69091984/tool-doesnt-work-on-big-data-set-single-positional-indexer-is-out-of-bounds +https://stackoverflow.com/questions/68983852/pandas-udf-function-takes-unusually-long-to-complete-on-big-data +https://stackoverflow.com/questions/68730436/mysql-in-select-big-data-slowdown +https://stackoverflow.com/questions/68671589/how-does-the-firestore-pricing-work-by-big-data +https://stackoverflow.com/questions/68577442/how-to-read-large-sav-files-in-r-with-big-data-packages +https://stackoverflow.com/questions/68622507/react-native-flatlist-is-slow-with-dynamic-items-and-a-big-data +https://stackoverflow.com/questions/68534132/how-to-train-a-model-with-big-data-size-and-limited-memory-ram 
+https://stackoverflow.com/questions/68462396/better-faster-way-to-sum-ifelse-for-a-large-set-of-columns-in-a-big-data-fra +https://stackoverflow.com/questions/68386550/how-to-install-m2eclipse-to-talend-studio-for-big-data +https://stackoverflow.com/questions/67952310/class-diagram-for-big-data-batch-processing +https://stackoverflow.com/questions/68323326/concatenating-group-by-series-into-one-on-big-data +https://stackoverflow.com/questions/68223704/error-404-on-a-valid-url-because-im-passing-big-data-trought-post +https://stackoverflow.com/questions/68112626/most-efficient-way-to-write-big-data-structures-to-a-file +https://stackoverflow.com/questions/67834006/best-practices-big-data-with-mysql +https://stackoverflow.com/questions/68066157/how-to-group-search-by-time-field-in-a-big-data-table-of-pgsql +https://stackoverflow.com/questions/67898420/hdfs-is-for-big-data-storage-and-azure-storage +https://stackoverflow.com/questions/67974961/all-available-ram-was-used-in-google-colab-while-training-a-model-of-big-data +https://stackoverflow.com/questions/67884548/how-to-save-big-data-using-natife-file-system-api +https://stackoverflow.com/questions/67744517/statistical-calculus-in-big-data-set-wrong-values +https://stackoverflow.com/questions/67733526/xamarin-forms-block-ui-when-itemssource-load-a-big-data +https://stackoverflow.com/questions/67692309/processing-big-data-on-distributed-system +https://stackoverflow.com/questions/67359449/dataproc-didnt-process-big-data-in-parallel-using-pyspark +https://stackoverflow.com/questions/67505183/laravel-yajra-datatable-not-working-with-big-data +https://stackoverflow.com/questions/67323577/optimal-big-data-solution-for-aggregating-time-series-data-and-storing-results-t +https://stackoverflow.com/questions/67090860/how-do-i-match-two-different-big-data-frame-in-r +https://stackoverflow.com/questions/66992550/should-i-use-stream-to-get-big-data-from-mysql 
+https://stackoverflow.com/questions/66915634/xarray-where-on-netcdf-big-data +https://stackoverflow.com/questions/66910914/fastest-way-of-persisting-a-stream-of-big-data-structured-data-into-a-snowflak +https://stackoverflow.com/questions/65568588/excel-error-may-be-caused-by-pandas-writing-or-big-data-advise-needed +https://stackoverflow.com/questions/66744410/laravel-delete-big-data +https://stackoverflow.com/questions/66615614/how-to-create-many-data-frames-and-combine-them-in-one-big-data-frame-to-avoid-c +https://stackoverflow.com/questions/66613841/how-to-speed-up-a-highly-active-big-data-table-mysql +https://stackoverflow.com/questions/66593737/what-format-can-be-used-for-big-data-in-sql +https://stackoverflow.com/questions/66481824/unable-to-open-pandas-python-package-from-azure-data-studio-while-configuring-s +https://stackoverflow.com/questions/66473923/how-to-query-big-data-in-dynamodb-in-best-practice +https://stackoverflow.com/questions/66434775/should-i-use-mysql-or-firebase-with-big-data +https://stackoverflow.com/questions/66398733/what-is-the-best-way-to-work-with-big-data-in-mysql-follow-up-between-members +https://stackoverflow.com/questions/66343840/generate-big-data-in-excel-or-pdf-using-rest-api +https://stackoverflow.com/questions/66277804/result-set-takes-long-to-process-big-data-from-oracle +https://stackoverflow.com/questions/66082266/efficient-way-of-getting-big-data-from-hadoop-into-spark +https://stackoverflow.com/questions/66078412/flutter-tcp-socket-seems-to-loose-1-2-bytes-when-sending-big-data +https://stackoverflow.com/questions/65901453/mysql-longtext-filed-concat-big-data-chunks +https://stackoverflow.com/questions/65908898/flatlist-rendering-is-heavy-for-big-data-set +https://stackoverflow.com/questions/65851090/update-datagrid-row-by-row-from-a-big-data-table-progress-database-using-a-ta +https://stackoverflow.com/questions/65846053/daily-etl-job-big-data-files 
+https://stackoverflow.com/questions/65818059/unstack-a-big-data-table-kusto-by-timestamp-and-category +https://stackoverflow.com/questions/65800535/cant-access-webhdfs-using-big-data-europe-with-docker-compose +https://stackoverflow.com/questions/65759593/how-to-export-smaller-collection-in-mongodb-big-data-aggregations-time-out +https://stackoverflow.com/questions/65703294/how-to-clean-up-big-data-and-reshape-it-in-pandas +https://stackoverflow.com/questions/65670954/how-can-we-solve-a-two-sum-algorithm-as-a-big-data-problem-leveraging-mapreduce +https://stackoverflow.com/questions/65631236/big-data-with-angular-ui-grid-feature-grouping-selection +https://stackoverflow.com/questions/65590919/running-arithmatics-through-big-data-in-python-pandas +https://stackoverflow.com/questions/65587607/optimizing-load-of-big-data-with-javascript +https://medium.com/@shehroz1447/data-quality-with-dbt-tests-and-great-expectations-b349634089bf +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db +https://ghoshm21.medium.com/how-to-download-really-big-data-sets-for-big-data-testing-ea33b9100f09 +https://medium.com/@hugolu87/how-to-do-data-quality-testing-for-freeusing-dbt-4f0b249cd485 +https://medium.com/slalom-build/4-tips-for-data-quality-validations-with-pytest-and-pyspark-69e100fd387e +https://medium.com/openmetadata/leveraging-the-power-of-openmetadata-data-quality-framework-385ba2d8eaf +https://michael-scherding.medium.com/automating-data-quality-checks-in-google-bigquery-b84d0e1873c3 +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON +https://medium.com/@brunouy/a-guide-to-open-source-data-quality-tools-in-late-2023-f9dbadbc7948 +https://medium.com/openmetadata/why-are-we-building-a-data-quality-standard-1753fae87259 +https://medium.com/tom-harrison-jr/another-approach-to-testing-big-data-50ced2177cfb 
+https://urban-institute.medium.com/automating-data-quality-checks-with-great-expectations-f6b7a8e51201 +https://medium.com/towards-data-science/why-data-quality-is-harder-than-code-quality-a7ab78c9d9e +https://barrmoses.medium.com/your-data-quality-strategy-should-be-automated-heres-where-to-start-04d77c4398d2 +https://medium.com/slalom-build/the-challenge-of-testing-data-pipelines-4450744a84f1 +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63 +https://medium.com/@knoldus/what-is-big-data-testing-cc96291ba24e +https://medium.com/snowflake/how-to-ensure-data-quality-with-great-expectations-271e3ca8b4b9 +https://medium.com/insider-inc-engineering/observable-data-quality-with-elementary-and-datahub-6fa5f92f2c81 +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9 +https://medium.com/swlh/why-i-built-an-opensource-tool-for-big-data-testing-and-quality-control-182a14701e8d +https://medium.com/dataform/testing-data-quality-with-sql-assertions-2053755395e7 +https://medium.com/orchestras-data-release-pipeline-blog/advanced-snowflake-native-data-quality-testing-using-orchestra-05a2ea3b06ab +https://medium.com/@kazarmax/using-soda-core-to-check-data-quality-07b370da2df3 +https://medium.com/oceanize-geeks/big-data-testing-challenges-72be7d4d3390 +https://medium.com/@dioskurn/data-quality-test-using-machine-learning-8a9bab60533b +https://medium.com/@brunouy/the-essential-role-of-automated-tests-in-data-pipelines-bb7b81fbd21b +https://medium.com/@krupesh.desai/test-the-big-data-waters-4521d3d5fbce +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c +https://medium.com/openmetadata/simple-easy-and-efficient-data-quality-with-openmetadata-1c4e7d329364 +https://medium.com/@vlad-pasman/data-quality-with-snowflake-6bddf05aa053 
+https://medium.com/@baluramachandra90/perform-your-data-quality-checks-with-a-single-line-of-code-37e6665e72e5 +https://medium.com/@geekfrosty/pydeequ-testing-data-quality-at-scale-209b674a4259 +https://medium.com/never-stop-writing/exam-diaries-what-happened-in-todays-big-data-test-c39cc7cf43b8 +https://medium.com/@mikldd/measuring-data-quality-bringing-theory-into-practice-41742e54d62f +https://medium.com/@vutrinh274/i-spent-3-hours-learning-how-uber-manages-data-quality-8ae8fa56b8d0 +https://medium.com/israeli-tech-radar/action-position-data-quality-assessment-framework-d833f6b77b7 +https://medium.com/towards-data-science/transforming-data-quality-automating-sql-testing-for-faster-smarter-analytics-6da431493570 +https://medium.com/99xtechnology/a-beginners-guide-to-big-data-testing-8db93386f35b +https://medium.com/python-in-plain-english/perform-data-quality-test-on-your-data-pipelines-with-great-expectations-bbe8f5e8816b +https://barrmoses.medium.com/data-quality-management-in-the-age-of-ai-7c85e545efd0 +https://medium.com/towards-data-science/test-your-data-until-it-hurts-306a7d7e4f84 +https://medium.com/art-of-data-engineering/data-engineering-with-a-cover-and-move-approach-to-data-quality-e564bd1f2ec5 +https://medium.com/@nydas/ensuring-data-integrity-a-data-engineers-guide-to-testing-19d266b4eb4d +https://medium.com/99p-labs/implementing-data-quality-at-scale-investigating-validation-testing-for-large-data-sets-7087928e5d3e +https://medium.com/building-ibotta/pipeline-quality-checks-circuit-breakers-and-other-validation-mechanisms-761fc5b1ebe4 +https://medium.com/@pallavisinha12/create-data-quality-framework-with-great-expectations-911b42a5312f +https://medium.com/hurb-labs/data-quality-a-lesson-from-the-myth-behind-popeye-the-sailor-a7bd50b61510 +https://medium.com/@mpchang17/my-team-won-the-2024-big-data-bowl-ca9f668d011d +https://medium.com/globant/know-your-data-better-with-great-expectations-1fffbe2ab1fa 
+https://jonathanlao.medium.com/omscs-big-data-for-health-informatics-7f31619d28f6 +https://medium.com/data-science-at-microsoft/partnering-for-data-quality-dc9123557f8b +https://ajithshetty28.medium.com/deequ-i-mean-data-quality-a0e6c048469d +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff +https://noahlk.medium.com/dbt-how-we-improved-our-data-quality-by-cutting-80-of-our-tests-78fc35621e4e +https://medium.com/@mariusz_kujawski/data-quality-in-google-cloud-bigquery-and-data-lake-using-great-expectations-cad5bf47f91b +https://medium.com/@crossiUX/the-problem-with-looking-at-only-big-data-808e25f87ff6 +https://roysandip.medium.com/data-validation-at-scale-with-spark-databricks-74d552b5331e +https://medium.com/astrafy/data-quality-with-great-expectations-e41504d93e17 +https://medium.com/dev-genius/how-to-integrate-data-quality-tests-in-the-python-etl-pipeline-359a535de564 +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b +https://medium.com/@Dima/big-data-checklist-1b8e3214f96 +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22 +https://medium.com/opendatadiscovery/data-quality-dashboard-9abb22bd0ee2 +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e +https://mtajchert.medium.com/how-i-used-big-data-to-pass-my-exam-75b5d7407165 +https://medium.com/orchestras-data-release-pipeline-blog/orchestration-with-data-quality-announcing-data-reconciliation-fe2fda6709ee +https://roysandip.medium.com/data-quality-with-databricks-delta-live-tables-4163ca8c8425 +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37 +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69 
+https://maikpaixao.medium.com/data-quality-with-great-expectation-in-python-0908b179f615 +https://medium.com/@tigeranalytics/automated-data-quality-checks-with-deequ-using-spark-979007b89f7b +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c +https://medium.com/opendatadiscovery/enhancing-data-quality-with-opendatadiscovery-and-greatexpectations-65096fd310b2 +https://kovidrathee.medium.com/list/automation-testing-for-data-ef054605a246 +https://medium.com/@hans.knechtions/test-in-production-85224e7a82f3 +https://medium.com/openmetadata/how-to-integrate-openmetadata-test-suites-with-your-data-pipelines-d83fb55fa494 +https://medium.com/towards-data-science/we-built-an-open-source-data-quality-testframework-for-pyspark-2301b9d87127 +https://kevinkautz.medium.com/dataops-and-data-quality-67eacecd5ff9 +https://ahmed-mokbel.medium.com/how-to-use-soda-for-data-quality-checks-with-apache-airflow-cf249a737b5a +https://medium.com/bigeye/testing-vs-observability-which-is-right-for-your-data-quality-needs-1ceb34a12867 +https://medium.com/salesforce-architects/the-importance-of-data-quality-quantity-for-performance-and-scale-testing-8fabd8c6a9cf +https://medium.com/@loginradius/big-data-testing-strategy-6559d91027b7 +https://medium.com/@erkajalkumari/ensuring-data-integrity-leveraging-dagster-and-great-expectations-for-automated-data-quality-e8f4bfd06e83 +https://medium.com/@dabodie/automate-data-quality-with-an-llm-17db76049187 +https://medium.com/tiket-com/creating-a-custom-data-quality-check-on-dbt-data-build-tool-ceec919702a1 +https://medium.com/data-ops/how-data-analytics-professionals-can-sleep-better-6dedfa6daa08 +https://medium.com/@abdelbarrechafik/using-models-and-tests-with-dbt-and-databricks-ensuring-data-quality-and-accuracy-64f0004d0946 +https://medium.com/snowflake/avoid-bad-data-completely-continuous-delivery-architectures-in-the-modern-data-stack-part-2-2c4240bdb973 
+https://medium.com/@hyonschu/big-data-is-dead-all-aboard-the-ai-hype-train-ae89c8d64cc3 +https://medium.com/@leonardojaxson402_59721/role-of-big-data-analytics-in-semiconductor-testing-software-b269cf424aa +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143 +https://medium.com/@lucas_dvrs/why-data-quality-is-hard-f4f58d058082 +https://medium.com/@ssharma31/advanced-data-quality-constraints-using-databricks-delta-live-tables-2880ba8a9cd7 +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76 +https://medium.com/@chandukavar/from-application-developer-to-big-data-engineer-d53f14f8c618 +https://medium.com/@iamjaswanth9/data-quality-in-snowflake-using-soda-a-complete-guide-232b7da5a3c1 +https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67 +https://medium.com/data-quality-and-tools/build-quality-into-extract-transform-and-load-process-c02795ddcc93 +https://informationit27.medium.com/explain-big-data-testing-b555517f9902 +https://medium.com/@mikldd/how-to-measure-data-quality-cc3d81dd98be +https://stackoverflow.com/questions/76508030/filter-big-data-with-limit-result-in-vb-net-and-sql +https://stackoverflow.com/questions/77695454/i-am-trying-to-utilize-griddb-for-my-big-data-project-but-installation-is-stuck +https://stackoverflow.com/questions/77049167/working-with-big-data-sets-in-r-with-parquet +https://stackoverflow.com/questions/77588731/how-to-take-distinct-column-values-of-rows-from-big-data-kql-query-output +https://stackoverflow.com/questions/77525647/how-to-pass-big-data-from-a-factory-to-a-constructor-with-neither-dynamic-memory +https://stackoverflow.com/questions/77367333/how-to-limit-memory-cost-when-request-big-data-files +https://stackoverflow.com/questions/77247941/summarizing-n-grams-efficiently-in-python-on-big-data 
+https://stackoverflow.com/questions/77365411/to-stata-big-data-file-causing-python-to-crash +https://stackoverflow.com/questions/77345049/database-migrated-with-talend-big-data-but-there-is-a-jump-on-id +https://stackoverflow.com/questions/77005778/how-to-maintain-online-statistics-for-big-data +https://stackoverflow.com/questions/77267600/nodestream-sequelize-and-big-data +https://stackoverflow.com/questions/77250735/mysql-insert-big-data-in-5-sec +https://stackoverflow.com/questions/77233547/fetching-big-data-mapbox-api-js +https://stackoverflow.com/questions/77151109/how-to-aggregate-a-big-data-frame-by-sliding-window-along-the-rows +https://stackoverflow.com/questions/77043892/how-to-quickly-share-big-data-in-python +https://stackoverflow.com/questions/77028722/updating-or-fetching-big-data-from-mongodb +https://stackoverflow.com/questions/77024225/plotting-a-histogram-for-big-data +https://stackoverflow.com/questions/77019467/how-to-get-a-count-for-the-amount-of-columns-per-row-that-are-equal-or-greater-t +https://stackoverflow.com/questions/76990405/reactjs-loading-big-data-async-causes-bad-lighthouse-performance-rating +https://stackoverflow.com/questions/76931124/correlation-matrix-of-big-data +https://stackoverflow.com/questions/76749002/how-does-tcp-combine-data-when-sending-a-big-data-packet-which-is-over-mss +https://stackoverflow.com/questions/76637645/big-data-returns-cors-error-typeerror-failed-to-fetch-not-consuming-the-api +https://stackoverflow.com/questions/76652275/react-app-performance-issue-when-fetching-big-data +https://stackoverflow.com/questions/76561998/importing-big-data-in-a-table-for-posgtresdb-stdout-is-not-tty-stdin-is-not-tt +https://stackoverflow.com/questions/76558022/how-to-find-the-maximum-value-for-given-range-in-a-big-data-set +https://stackoverflow.com/questions/76374129/computing-persistent-homology-betti-numbers-on-big-data 
+https://stackoverflow.com/questions/76438296/replacing-selected-column-values-of-a-big-data-spark-dataframe-if-the-id-matches +https://stackoverflow.com/questions/76148029/querying-a-big-data-table-using-py-spark +https://stackoverflow.com/questions/76104308/randomforest-for-big-data +https://stackoverflow.com/questions/76103457/variable-selection-in-big-data +https://stackoverflow.com/questions/75946787/data-analytics-on-a-map-for-big-data-using-mapbox +https://stackoverflow.com/questions/75945165/whats-the-best-algorithm-to-move-big-data-between-two-databases +https://stackoverflow.com/questions/75941261/fastest-way-to-get-big-data-from-warehouse-to-server +https://stackoverflow.com/questions/75834201/how-to-make-a-scatter-plot-in-r-with-a-big-data-frame +https://stackoverflow.com/questions/75834497/transpose-with-multiple-criteria-big-data-set +https://stackoverflow.com/questions/75703227/moving-big-data-from-table-storage-into-something-more-queryable +https://stackoverflow.com/questions/75816145/while-loop-error-which-only-occurs-with-a-big-data-frame +https://stackoverflow.com/questions/75797834/send-very-big-data-to-an-api-in-parallel-and-catching-errors-within-promise-alls +https://stackoverflow.com/questions/75752574/optimal-approach-for-displaying-big-data-tables-in-a-template +https://stackoverflow.com/questions/75697603/what-will-happened-if-we-insert-extremely-big-data-into-query-parameter +https://stackoverflow.com/questions/75455730/incremental-powertransformation-on-big-data +https://stackoverflow.com/questions/75404296/how-to-run-dirichlet-regression-with-a-big-data-set-in-r +https://stackoverflow.com/questions/75400350/how-to-upload-big-data-to-mongodb +https://stackoverflow.com/questions/75359882/multiprocessing-crashes-on-big-data-oserror-errno-24-too-many-open-files +https://stackoverflow.com/questions/75141934/redash-query-join-with-another-query-have-big-data 
+https://stackoverflow.com/questions/75042068/how-to-compare-the-list-map-of-custom-objects-field-by-field-to-create-mismatch +https://stackoverflow.com/questions/70718209/workaround-for-ggplot2facet-grid-big-data-bug +https://stackoverflow.com/questions/73823770/how-to-define-keystore-for-kafka-in-big-data-tool-connections-idea-plugin +https://stackoverflow.com/questions/73239645/improving-time-efficiency-of-code-working-with-a-big-data-set-using-python +https://stackoverflow.com/questions/74917981/how-to-upload-big-data-from-two-microservices-at-once +https://stackoverflow.com/questions/74829692/how-do-i-reduce-the-run-time-for-big-data-pyspark-scripts +https://stackoverflow.com/questions/74804741/i-am-working-with-nfl-positional-data-provided-for-the-2022-nfl-big-data-bowl-an +https://stackoverflow.com/questions/74798114/how-to-fetch-big-data-in-vue +https://stackoverflow.com/questions/74754816/how-to-create-a-big-data-frame-from-a-function-with-few-continuous-vectors +https://stackoverflow.com/questions/74559587/command-working-for-small-data-but-not-for-big-data +https://stackoverflow.com/questions/74500537/how-can-i-use-multiprocess-when-processing-big-data-with-python +https://stackoverflow.com/questions/74428163/big-data-batch-and-stream-data-pipeline-with-hadoop-spark +https://stackoverflow.com/questions/74389753/export-big-data-from-oracle-db-to-bcp-file +https://stackoverflow.com/questions/74358537/pyspark-giving-incorrect-result-on-rank-for-big-data +https://stackoverflow.com/questions/74281750/why-does-python-index-error-for-big-data +https://stackoverflow.com/questions/74203757/talend-big-data-streaming-not-supporting-subjob +https://stackoverflow.com/questions/74142721/combine-big-data-stored-in-subdirectories-as-100-000-csv-files-of-size-200-gb-w +https://stackoverflow.com/questions/74020975/is-there-any-way-to-increase-heap-size-in-weka-3-7-13-for-executing-the-big-data 
+https://stackoverflow.com/questions/73991036/how-to-pass-a-big-data-object-to-another-page-with-dynamic-route-in-next-js-wit +https://stackoverflow.com/questions/73987388/mongodb-big-data-processing-takes-huge-amount-of-time +https://stackoverflow.com/questions/73844466/why-is-non-zeroed-memory-only-a-problem-with-big-data-usage +https://stackoverflow.com/questions/73826839/pyspark-big-data-question-how-to-add-column-from-another-dataframe-no-common +https://stackoverflow.com/questions/73666523/mongodb-is-too-slow-on-selecting-big-data +https://stackoverflow.com/questions/73635948/datatables-export-all-to-excel-server-side-big-data-oracle +https://stackoverflow.com/questions/73627847/big-data-in-uipageviewcontroller-cause-problem-to-the-performance +https://stackoverflow.com/questions/73623028/interpolation-of-big-data-sets-interp1d-with-timestamps-python +https://stackoverflow.com/questions/73447132/sql-snowflake-take-out-big-data +https://stackoverflow.com/questions/73414391/parsing-text-file-with-python-taking-only-the-important-data-from-a-big-data-an +https://stackoverflow.com/questions/73283522/miceforest-imputation-based-on-groupby-on-big-data +https://stackoverflow.com/questions/73274450/big-data-in-tableview +https://stackoverflow.com/questions/73251309/how-to-feed-big-data-into-pipeline-of-huggingface-for-inference +https://stackoverflow.com/questions/73184424/selecting-more-than-two-groups-from-a-big-data-frame-for-correlation-and-plottin +https://stackoverflow.com/questions/73033646/issue-loading-big-data-using-apache-spark-connector-for-sql-server-to-azure-sql +https://stackoverflow.com/questions/72970343/plotting-top-10-values-in-big-data +https://stackoverflow.com/questions/72962982/continuously-changing-big-data-and-c +https://stackoverflow.com/questions/72963109/telerikgrid-in-blazor-filter-is-taking-to-much-time-for-big-data-set +https://stackoverflow.com/questions/72959538/caching-for-big-data-queried-via-flask-and-celery 
+https://stackoverflow.com/questions/72914084/historical-big-data-slow-queries +https://stackoverflow.com/questions/72813642/plotting-rows-and-columns-of-big-data-in-an-interpretable-way +https://stackoverflow.com/questions/72775687/saving-big-data-in-csv-file +https://stackoverflow.com/questions/72732558/transposing-a-big-data-file-in-one-line-python-unix +https://stackoverflow.com/questions/72677806/how-to-statically-typize-a-big-data-objects-in-java +https://stackoverflow.com/questions/72733255/big-data-dataframe-from-an-on-disk-mem-mapped-binary-struct-format-from-python +https://stackoverflow.com/questions/72685833/how-to-handle-big-data-json-having-more-than-32767-keys +https://stackoverflow.com/questions/72582293/order-of-installing-big-data-modules-on-ubuntu +https://stackoverflow.com/questions/72580546/how-can-i-add-a-new-column-based-on-two-dataframes-and-conditions-for-big-data +https://stackoverflow.com/questions/72573602/avoid-big-data-in-audit-logs-with-sqlalchemy +https://stackoverflow.com/questions/72565218/proportional-allocation-sampling-using-dplyr-package-in-r-for-big-data-frame +https://stackoverflow.com/questions/72463190/how-to-concatenate-strings-from-using-groupby-in-big-data-frames +https://stackoverflow.com/questions/72455435/flatlist-big-data-renderitem-is-called-for-every-elements +https://stackoverflow.com/questions/72151225/polymorphic-data-transformation-techniques-data-lake-big-data +https://stackoverflow.com/questions/71930333/splitting-up-a-big-data-frame-into-smaller-subset-column-wise +https://stackoverflow.com/questions/71834909/replace-the-values-of-the-big-data-frame-with-another-values +https://stackoverflow.com/questions/71756911/big-data-scatterplot-adding-lines +https://stackoverflow.com/questions/71575120/big-data-problems-scaling-up-from-sub-sample-to-full-set-taking-forever-using-g +https://stackoverflow.com/questions/71574974/reshaping-big-data-long-based-on-column-name-patterns 
+https://stackoverflow.com/questions/71382552/ways-to-improve-method-for-calculating-sets-of-distances-in-big-data +https://stackoverflow.com/questions/71567382/serilog-c-how-to-prevent-logging-big-data-e-g-image-data-or-large-json-object +https://stackoverflow.com/questions/71567981/creating-a-boxplot-with-matplotlib-for-big-data +https://stackoverflow.com/questions/71492508/ram-overflow-and-long-loading-times-sql-query-big-data +https://stackoverflow.com/questions/71370643/how-to-read-a-big-data-50g-from-memory-rather-than-local-disk-in-python +https://stackoverflow.com/questions/71368486/im-trying-to-remove-duplicate-from-big-data4919214-2-but-got-this-error +https://stackoverflow.com/questions/71170710/how-to-circumvent-spice-limitations-500-m-rows-to-create-a-quicksight-dashboar +https://stackoverflow.com/questions/70958817/getting-big-data-through-signalr-blazor +https://stackoverflow.com/questions/71036944/is-dc-js-used-with-crossfilter-and-d3-js-still-a-good-option-for-big-data-visu +https://stackoverflow.com/questions/71074303/networkx-problem-while-working-big-data +https://stackoverflow.com/questions/71035982/wget-with-big-data-file-straight-to-s3 +https://stackoverflow.com/questions/71010264/flatlist-is-very-slow-in-using-big-data-in-react-native +https://stackoverflow.com/questions/70985029/get-big-data-from-api-through-postman-got-error-sort-exceeded-memory-limit-of +https://stackoverflow.com/questions/70981562/how-to-connect-sql-server-bdc-big-data-cluster-from-oracle-enviornment +https://stackoverflow.com/questions/70902290/what-is-the-meaning-of-big-data-in-sense-the-limit-or-the-range-beyond-which-ca +https://stackoverflow.com/questions/70840513/converting-character-to-hms-big-data +https://stackoverflow.com/questions/70699341/how-can-i-insert-my-big-data-in-html-on-chunks +https://stackoverflow.com/questions/70571778/tsqlt-assertequalstable-takes-hours-to-complete-when-big-data-set-involves 
+https://stackoverflow.com/questions/70568605/fgets-vs-getc-with-big-data +https://stackoverflow.com/questions/70551621/big-data-in-pytorch-help-for-tuning-steps +https://www.linkedin.com/pulse/testing-tableau-i-like-big-data-cannot-lie-sarah-richey +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality +https://www.linkedin.com/posts/kevinzenghu_dataengineering-dataquality-analytics-activity-7211081770077679620-ZQGB +https://www.linkedin.com/posts/davebkaplan_data-warehouse-testing-strategies-for-better-activity-7067445009863581696-iYWl +https://www.linkedin.com/advice/3/how-do-you-create-data-quality-test-plan-skills-data-quality +https://www.linkedin.com/posts/nicole-janeway-bills_exam-debrief-on-the-data-quality-specialist-activity-7249150004441792512-2pRK +https://www.linkedin.com/pulse/perils-big-data-gary-lineker-test-dr-chris-donegan +https://www.linkedin.com/pulse/big-data-testing-qa-touch +https://www.linkedin.com/pulse/tale-data-engineer-small-big-enterprise-strategy-testing-munir +https://www.linkedin.com/posts/johncsteiner_putting-fleet-data-quality-to-test-for-fleet-activity-7258237389540589568-pux7 +https://www.linkedin.com/pulse/software-testing-challenges-big-data-rahul-malhotra +https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory +https://www.linkedin.com/pulse/power-dna-self-testing-big-data-sven-a-jensen +https://www.linkedin.com/posts/datafold_what-is-proactive-data-quality-testing-3-activity-7201971592669655040-SUHw +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle +https://www.linkedin.com/pulse/data-quality-validations-ai-generated-part-ii-gen-ai-hemachandran +https://www.linkedin.com/pulse/data-quality-does-equal-documentation-5-super-test-your-bellehumeur/ 
+https://www.linkedin.com/pulse/predictive-analytics-big-datathe-test-book-william-kevin-furlow +https://www.linkedin.com/pulse/streamlining-data-complexities-big-testing-case-study-apptestify-6fddf +https://www.linkedin.com/pulse/how-test-your-big-idea-without-data-rick-harris +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e +https://www.linkedin.com/posts/cdiggins_here-is-a-test-where-i-am-sharing-a-very-activity-7257822348211367937-rkRc +https://www.linkedin.com/pulse/what-i-learned-from-executing-data-quality-projects-david-finlay +https://www.linkedin.com/advice/0/how-do-you-test-data-quality-across-formats-skills-data-engineering +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov +https://www.linkedin.com/pulse/big-data-testing-market-size-status-forecast-2024-2031-kjqtc +https://www.linkedin.com/posts/yuzhengsun_data-quality-is-a-challenge-for-most-companies-activity-7226336798593970176-nQiB +https://www.linkedin.com/pulse/big-data-hadoop-testing-pooja-chougule-1 +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus- +https://www.linkedin.com/pulse/journey-from-big-data-smart-how-testing-can-help-you-debjani-goswami?trk=public_post +https://www.linkedin.com/advice/0/what-best-methods-measuring-data-quality-testing +https://www.linkedin.com/pulse/data-quality-gates-testing-our-we-create-next-insurance-engineering +https://www.linkedin.com/pulse/effective-big-data-test-ritesh-garg +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair +https://www.linkedin.com/posts/audralawson_hallucination-or-vision-establishing-reliability-activity-7255911178894237696-TdSM +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy 
+https://www.linkedin.com/pulse/2016-predictions-security-devops-big-data-mobile-testing-ulf-mattsson +https://www.linkedin.com/pulse/testing-big-data-gagan-mehra +https://www.linkedin.com/learning/data-science-foundations-data-engineering/data-quality-testing +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment +https://www.linkedin.com/advice/0/what-best-tools-methods-data-quality-assessment +https://www.linkedin.com/pulse/big-data-warehouse-testing-nigel-shaw +https://www.linkedin.com/advice/0/how-do-you-test-evaluate-data-quality-variations +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f +https://www.linkedin.com/pulse/take-tom-davenports-big-data-challenge-tom-davenport +https://www.linkedin.com/pulse/why-big-data-testing-important-healthcare-industry-debjani-goswami +https://www.linkedin.com/pulse/big-data-testing-bukky-olawoyin +https://www.linkedin.com/posts/wim-hardyns-7250a129_kick-off-big-data-policing-field-test-activity-7167921293525266432-F0oR +https://www.linkedin.com/pulse/elevating-data-quality-using-open-source-dbt-core-test-vetriselvan-sroyc +https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e +https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory +https://www.linkedin.com/advice/1/how-do-you-test-data-quality-governance-management +https://www.linkedin.com/pulse/road-map-from-non-tech-big-data-testing-krishna-kayaking +https://www.linkedin.com/pulse/data-quality-testing-grant-brodie +https://www.linkedin.com/jobs/view/glm-data-quality-testing-manager-at-bank-of-america-4022477308 +https://www.linkedin.com/pulse/minesweeper-big-data-testing-roadmap-cognitive-chapter-shay-cohen +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla 
+https://www.linkedin.com/pulse/key-considerations-big-data-application-testing-taukir-hasan +https://www.linkedin.com/pulse/63-dimensions-data-quality-assessment-ankur-gupta +https://www.linkedin.com/pulse/aligning-big-data-application-testing-charles-richter +https://www.linkedin.com/pulse/big-data-bad-primer-testing-alex-rodov +https://www.linkedin.com/pulse/automation-data-quality-testing-marina-veber-cfa +https://www.linkedin.com/advice/1/what-steps-effective-data-quality-testing-skills-data-warehousing-ka8kc +https://www.linkedin.com/advice/3/what-your-data-quality-testing-objectives-skills-data-quality +https://www.linkedin.com/pulse/candidate-personality-testing-collision-big-data-danielle-larocca +https://www.linkedin.com/pulse/day-15-dbt-testing-ensuring-data-quality-built-in-tests-surya-ambati-wfawc +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-lancesoft-inc-4079696369 +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri +https://www.linkedin.com/jobs/view/data-quality-test-engineer-at-a-united-pakistan-4084135437 +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye +https://www.linkedin.com/pulse/crash-test-your-business-model-disruptive-big-data-tesla-danner +https://www.linkedin.com/posts/bianciniandrea_big-data-test-infrastructure-bdti-activity-7249798888075804672-EAs5 +https://www.linkedin.com/posts/exafluence_automatedtesting-exceptionmanagement-datainmotion-activity-7108127354903793664-2HWf +https://www.linkedin.com/pulse/elevate-your-central-location-tests-clt-mastering-data-gabriel-velez-oogve?trk=article-ssr-frontend-pulse_more-articles_related-content-card +https://www.linkedin.com/posts/mecanica-scientific-svcs-corp_putting-fleet-data-quality-to-test-for-fleet-activity-7258216521196285952-k_O1 +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki 
+https://www.linkedin.com/advice/1/how-do-you-test-data-quality-after-cleaning-skills-data-analytics +https://www.linkedin.com/jobs/test-engineer-aws-big-data-jobs-tempe-az +https://www.linkedin.com/advice/0/what-best-way-test-security-privacy-big-data-applications-wjouc +https://www.linkedin.com/pulse/big-data-new-technologies-enhance-genetic-testing-christopher-colucci +https://www.linkedin.com/advice/3/how-do-you-define-test-data-quality-coverage-criteria +https://www.linkedin.com/pulse/emulating-big-data-rainmakers-test-massive-disaster-warnings-xavier +https://www.linkedin.com/pulse/tools-technologies-data-quality-management-robert-seltzer-o29tc +https://www.linkedin.com/pulse/my-path-learn-big-data-pass-aws-certified-analytics-specialty-yin +https://www.linkedin.com/pulse/using-models-tests-dbt-databricks-ensuring-data-quality-chafik +https://www.linkedin.com/pulse/big-data-testing-nabarun-purkayastha +https://www.linkedin.com/pulse/kibikibana-chembl-test-smoothest-way-life-sciences-big-tummarello +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7221835281807683584-aEot +https://www.linkedin.com/pulse/big-data-psa-test-market-rapid-growth-amid-challenges-sbsff?trk=public_post_main-feed-card_feed-article-content +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4064853325 +https://www.linkedin.com/jobs/view/test-lead-data-quality-engineering-p-c-insurance-at-sogeti-4076545953 +https://www.linkedin.com/posts/gilbertbenghiat_data-observability-and-data-quality-testing-activity-7197513747605716992-beFj +https://www.linkedin.com/pulse/big-data-testing-market-2024-detailed-mqtdf +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view +https://www.linkedin.com/advice/0/how-do-you-use-user-feedback-test-data-quality 
+https://www.linkedin.com/advice/3/what-best-ways-test-big-data-analytics-project-results-tnd0f +https://stackoverflow.com/questions/79133995/problem-with-assigning-new-ids-in-big-data-frames-for-long-data-in-r +https://stackoverflow.com/questions/78041617/how-to-properly-optimize-spark-and-milvus-to-handle-big-data +https://stackoverflow.com/questions/79021943/how-to-split-and-store-big-data-reports +https://stackoverflow.com/questions/78947494/how-to-export-data-into-several-flat-files-using-informatica-developer-big-data +https://stackoverflow.com/questions/78290693/how-to-json-formatted-big-data-send-to-gemini-to-ask-for-analysis +https://stackoverflow.com/questions/78847629/can-azure-ai-search-retrieve-all-the-sql-table-records-index-from-big-data +https://stackoverflow.com/questions/78013768/is-it-a-good-idea-to-write-big-data-trough-trino +https://stackoverflow.com/questions/78834805/storing-big-data1000-lines-per-second-and-reading-in-realtime-in-c +https://stackoverflow.com/questions/78824419/ruby-sidekiq-best-solution-for-execute-and-handle-big-data +https://stackoverflow.com/questions/78516150/how-to-use-mongodb-aggregation-pipeline-for-real-time-analytics-on-sharded-clust +https://stackoverflow.com/questions/78771511/big-data-to-implement-inverted-search-index +https://stackoverflow.com/questions/78528765/how-should-i-write-elasticsearch-search-querys-when-dealing-with-big-data +https://stackoverflow.com/questions/78551755/loading-analyzing-big-data-from-a-csv-in-r +https://stackoverflow.com/questions/78509755/how-can-filter-and-retrieve-specific-records-from-big-data-efficiently-using-pyt +https://stackoverflow.com/questions/78240971/ibis-vs-spark-for-big-data-processing-against-an-analytics-datawarehouse-with-a +https://stackoverflow.com/questions/78499951/nuxt-js-axios-send-big-data-from-laravel-back +https://stackoverflow.com/questions/78460850/patch-creation-methods-for-deep-learning-on-very-big-data-with-relatively-low-am 
+https://stackoverflow.com/questions/78457050/development-of-a-gis-choice-of-database-and-considerations-of-scalability-and-b +https://stackoverflow.com/questions/78391530/best-practice-to-preserve-the-big-data-for-table +https://stackoverflow.com/questions/77793446/jetpack-compose-dropdownmenu-for-big-data +https://stackoverflow.com/questions/78389336/how-to-compute-new-variables-out-of-items-using-rowmeans-function-in-a-loop-func +https://stackoverflow.com/questions/78379372/datatable-big-data-around-40k-takes-too-long-to-filter +https://stackoverflow.com/questions/78372734/how-to-use-async-filter-with-big-data +https://stackoverflow.com/questions/78319772/why-do-shared-memory-segments-run-longer-than-pipe-when-transferring-big-data +https://stackoverflow.com/questions/78323388/ingestion-av-big-data-sets-in-azure-for-datawarehouse +https://stackoverflow.com/questions/78321117/pyspark-for-big-data-analytics-assertion-error-facing-issues-converting-string +https://stackoverflow.com/questions/78319022/how-to-handle-big-data-from-slack-messages +https://stackoverflow.com/questions/78273303/issues-in-data-anonymisation-for-a-big-data-coursework-assignment +https://stackoverflow.com/questions/78253070/how-to-make-an-r-shiny-app-with-big-data +https://stackoverflow.com/questions/77991341/how-to-import-big-data-of-dat-format-in-a-fast-way +https://stackoverflow.com/questions/78082219/how-to-continuously-save-locally-big-data-from-tick-by-tick-streaming-without-ov +https://stackoverflow.com/questions/78147819/how-to-use-multiprocessing-in-python-with-big-data +https://stackoverflow.com/questions/78088115/pyspark-vs-sqlalchemy-which-is-better-for-dealing-with-big-data +https://stackoverflow.com/questions/78072497/how-identify-rows-in-big-data-frame-that-match-rows-in-little-data-frame +https://stackoverflow.com/questions/78028513/how-vespa-addresses-memory-limitations-in-big-data-applications 
+https://stackoverflow.com/questions/77954050/count-query-help-for-big-data-with-join-to-jsonb-column +https://stackoverflow.com/questions/77967983/how-to-simplify-a-creation-of-a-big-data +https://stackoverflow.com/questions/77884817/check-how-many-rows-add-up-to-a-number-check-inventory-coverage-days-in-panda +https://stackoverflow.com/questions/77875648/wordpress-big-data-handling-tools +https://stackoverflow.com/questions/77756650/how-to-export-pyspark-big-data-to-xls-or-csv +https://stackoverflow.com/questions/28236897/replace-outliers-from-big-data +https://stackoverflow.com/questions/37744728/kendo-ui-grid-grouping-and-paging-with-big-data +https://stackoverflow.com/questions/53986502/confusion-between-operational-and-analytical-big-data-and-on-which-category-hado +https://stackoverflow.com/questions/21527307/common-large-pst-files-to-test-big-data +https://stackoverflow.com/questions/43524694/where-does-big-data-go-and-how-is-it-stored +https://stackoverflow.com/questions/57535626/low-rendering-with-the-big-data-in-teechart-pro-vcl +https://stackoverflow.com/questions/46892773/big-data-generalized-linear-mixed-effects-models +https://stackoverflow.com/questions/36930860/how-to-optimise-handle-of-big-data-on-laravel +https://stackoverflow.com/questions/24262041/how-to-send-big-data-via-signalr-in-net-client +https://stackoverflow.com/questions/24841142/how-can-i-generate-big-data-sample-for-postgresql-using-generate-series-and-rand +https://stackoverflow.com/questions/52390028/is-data-lake-and-big-data-the-same +https://stackoverflow.com/questions/35616003/how-to-make-sap-lumira-desktop-not-import-big-data +https://stackoverflow.com/questions/34968832/best-way-to-store-big-data-in-swift +https://stackoverflow.com/questions/35560823/what-is-big-data-what-classifies-as-big-data +https://stackoverflow.com/questions/57464172/how-to-load-in-big-data-sets-with-st-read-without-exceeding-ram 
+https://stackoverflow.com/questions/58868031/how-machine-learning-intgreate-with-big-data +https://stackoverflow.com/questions/47921826/learning-big-data-for-a-real-case +https://stackoverflow.com/questions/44704465/pandas-df-groupby-is-too-slow-for-big-data-set-any-alternatives-methods +https://stackoverflow.com/questions/56740580/merge-multiple-files-into-one-big-data-table-column-names-do-not-match-in-the-f +https://stackoverflow.com/questions/47533766/what-is-the-difference-between-a-big-data-warehouse-and-a-traditional-data-wareh +https://stackoverflow.com/questions/47902776/high-performance-way-to-find-duplicated-rows-using-dplyr-on-big-data-set +https://stackoverflow.com/questions/52090453/how-to-improve-my-tables-and-queries-for-big-data-applications +https://stackoverflow.com/questions/48997676/error-message-for-processing-big-data +https://stackoverflow.com/questions/28066955/what-server-do-i-need-for-big-data-100gb-of-plain-text +https://stackoverflow.com/questions/46678720/pros-and-cons-of-big-data-and-small-data +https://stackoverflow.com/questions/22344707/primefaces-dataexporter-for-big-data +https://stackoverflow.com/questions/57341395/how-to-avoid-big-data-problem-when-dealing-nii-gz +https://stackoverflow.com/questions/47284485/python-code-performance-on-big-data-os-path-getsize +https://stackoverflow.com/questions/34941410/fetchfailedexception-or-metadatafetchfailedexception-when-processing-big-data-se +https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data +https://stackoverflow.com/questions/21160153/how-to-effectively-write-big-data-structure-to-file +https://stackoverflow.com/questions/56248555/unix-perl-python-substitute-list-on-big-data-set +https://stackoverflow.com/questions/54232066/big-data-load-in-pandas-data-frame +https://stackoverflow.com/questions/43585974/how-to-show-big-data-chart-with-good-performace +https://stackoverflow.com/questions/49438954/python-shared-memory-dictionary-for-mapping-big-data 
+https://stackoverflow.com/questions/51487769/how-to-insert-big-data-on-the-laravel +https://stackoverflow.com/questions/34065362/php-mysql-select-from-big-data +https://stackoverflow.com/questions/30688887/big-data-with-spatial-queries-indexing +https://stackoverflow.com/questions/51841091/importing-big-data-from-application-insights-to-powerbi +https://stackoverflow.com/questions/56041339/how-to-skip-duplicate-headers-in-multiple-csv-files-having-indetical-columns-and +https://stackoverflow.com/questions/53201858/how-to-persist-sensor-telemetry-data-into-cold-storage-such-as-big-data-storage +https://stackoverflow.com/questions/57672325/error-3-after-open-dataset-if-big-data-volume-is-processed-none-otherwise +https://stackoverflow.com/questions/21868369/pycharm-hanging-for-a-long-time-in-ipython-console-with-big-data +https://stackoverflow.com/questions/44502825/performance-testing-on-big-data +https://stackoverflow.com/questions/55292664/get-data-in-the-last-three-months-using-talend-big-data-hive +https://stackoverflow.com/questions/58314908/how-to-start-learning-big-data-what-are-the-modules-i-need-to-concentrate-on-as +https://stackoverflow.com/questions/31162894/how-to-create-big-data-project +https://stackoverflow.com/questions/44054061/what-is-3g-4g-of-big-data-mean-and-the-different +https://stackoverflow.com/questions/51889466/how-to-analyze-the-relationship-between-multiple-inputs-and-multiple-outputs-thr +https://stackoverflow.com/questions/52298007/is-spa-solution-proper-for-developing-an-big-data-approach-applications +https://stackoverflow.com/questions/36386361/how-to-receive-big-data-with-recv-function-using-c +https://stackoverflow.com/questions/56563626/combining-big-data-files-with-different-columns-into-one-big-file +https://stackoverflow.com/questions/57262225/how-to-access-individual-time-sample-of-nii-nifti-format-without-loading-fmri 
+https://stackoverflow.com/questions/59268599/how-to-cope-with-case-sensitive-column-names-in-big-data-file-formats-and-extern +https://stackoverflow.com/questions/50677597/what-does-big-data-have-to-do-with-cloud-computing +https://stackoverflow.com/questions/59427149/design-data-provisioning-strategy-for-big-data-system +https://stackoverflow.com/questions/32458713/compare-two-big-data-20-million-products +https://stackoverflow.com/questions/59530542/how-to-exclude-few-columns-and-replace-negative-values-in-big-data +https://stackoverflow.com/questions/59473878/error-in-angular-material-tree-when-displaying-big-data +https://stackoverflow.com/questions/41979781/asp-net-301-redirect-for-big-data +https://stackoverflow.com/questions/59456842/will-polymorphic-relation-cause-slowness-on-big-data +https://stackoverflow.com/questions/57082468/slow-first-read-big-data-in-realms +https://stackoverflow.com/questions/59456956/caching-big-data-in-net-core-web-api +https://stackoverflow.com/questions/59303786/how-to-iterate-a-thiveinput-in-a-talend-big-data-job +https://stackoverflow.com/questions/59189382/solutions-for-big-data-preprecessing-for-feeding-deep-neural-network-models-buil +https://stackoverflow.com/questions/58236374/big-data-database-on-top-of-openstack-swift +https://stackoverflow.com/questions/34521726/does-downsampling-of-big-data-in-python-bokeh-server-work-where-documented +https://stackoverflow.com/questions/31275867/can-bdd-work-for-big-data-etl-testing +https://stackoverflow.com/questions/48373636/big-data-in-datalab +https://stackoverflow.com/questions/58725538/do-we-visualize-big-data +https://stackoverflow.com/questions/58712147/res-write-not-sending-big-data-until-res-end-is-called-after-res-write-but-don +https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python +https://stackoverflow.com/questions/58577664/how-to-merge-big-data-of-csv-files-column-wise-into-a-single-csv-file-using-pand 
+https://stackoverflow.com/questions/58567273/how-to-cluster-big-data-using-python-or-r-without-memory-error +https://stackoverflow.com/questions/58575993/how-to-pull-big-data-with-jparepository +https://stackoverflow.com/questions/58570251/how-to-set-index-while-have-only-one-column-in-big-data-using-pandas +https://stackoverflow.com/questions/58568890/how-to-set-first-full-row-as-a-index-in-big-data-using-pandas +https://stackoverflow.com/questions/58014136/query-optimization-for-big-data-database +https://stackoverflow.com/questions/58406433/filter-array-from-big-data-collection-of-data +https://stackoverflow.com/questions/26156646/which-one-is-best-csv-or-json-in-order-to-import-big-data-php +https://stackoverflow.com/questions/58362241/is-my-big-data-framework-setup-complete-or-have-i-missed-something-crucial +https://stackoverflow.com/questions/49655984/azure-data-factory-failed-while-copying-big-data-files +https://stackoverflow.com/questions/58308006/big-data-load-in-salesforce +https://stackoverflow.com/questions/58306030/is-there-a-methodology-and-a-well-stablished-library-for-data-visualization-in-b +https://stackoverflow.com/questions/58274327/sql-server-big-data-replication-primary-key +https://stackoverflow.com/questions/43657979/running-a-website-web-application-that-analyzes-big-data +https://stackoverflow.com/questions/57879362/angular-filter-big-data-set-best-practices +https://stackoverflow.com/questions/58158135/what-do-people-mean-by-intermediate-results-when-talking-about-hadoop-spark +https://stackoverflow.com/questions/58130854/laravel-pass-big-data-through-a-view-load-time-slow +https://stackoverflow.com/questions/58038346/whats-the-best-practice-to-fetch-specific-fields-from-big-data-coming-from-rest +https://stackoverflow.com/questions/57969048/is-it-possible-to-simulate-big-data-flow-on-mongo-db +https://stackoverflow.com/questions/57968484/how-to-solve-java-net-socketexception-connection-reset-by-peer-socket-write-e 
+https://stackoverflow.com/questions/34043395/php-amazon-sqs-big-data +https://stackoverflow.com/questions/57930752/hash-string-to-be-sortable-big-data +https://stackoverflow.com/questions/57811076/loading-big-data-to-elasticsearch-and-kibana +https://stackoverflow.com/questions/57780324/optimize-a-having-count-distinct-query-for-big-data +https://stackoverflow.com/questions/57679012/find-outliers-without-loading-big-data +https://stackoverflow.com/questions/57614356/using-on-disk-cache-for-big-data-gigabytes-with-spring-cache-abstraction +https://stackoverflow.com/questions/57585469/using-pandas-how-to-use-column-data-for-statistics-analysis-for-big-data +https://stackoverflow.com/questions/57558129/sending-large-big-data-in-mpi-java-openmpi +https://softwareengineering.stackexchange.com/questions/387335/designing-a-big-data-web-app +https://softwareengineering.stackexchange.com/questions/342176/is-this-big-data-architecture-good-enough-to-handle-many-requests-per-second +https://softwareengineering.stackexchange.com/questions/340687/reading-and-saving-big-data-to-db +https://softwareengineering.stackexchange.com/questions/327667/srp-in-the-big-data-setting +https://softwareengineering.stackexchange.com/questions/303515/dealing-with-big-data +https://softwareengineering.stackexchange.com/questions/272872/can-fluent-dsls-exist-in-big-data-environments +https://softwareengineering.stackexchange.com/questions/270031/efficiently-save-big-data-structures +https://softwareengineering.stackexchange.com/questions/230150/big-data-can-it-be-pre-processed +https://sqa.stackexchange.com/questions/37718/big-data-application-testing diff --git a/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv b/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv new file mode 100644 index 0000000..6c44a2e --- /dev/null +++ b/docs_to_import/mrs_oliveira2025/cleaned_posts_with_test_tools_and_methods (1).csv @@ -0,0 +1,71 @@ 
+link,ferramentas,metodo +https://dev.to/kirekov/apache-spark-hive-and-spring-boot-testing-guide-mdp,"JUnit, JUnit 5, JUnit, Jest",Integration Testing +https://dev.to/carlosgonzagabsb/e-possivel-obter-100-de-automacao-de-testes-1h22,,Exploratory Testing +https://dev.to/keploy/test-data-management-a-comprehensive-guide-5730,Selenium, +https://dev.to/panasenco/test-driven-development-for-analytics-engineering-3nlo,,Test-Driven Development +https://dev.to/lambdatest/ensuring-quality-in-data-ai-a-comprehensive-approach-to-quality-in-the-age-of-data-ai-testm-2023-2phi,Selenium, +https://dev.to/mbogan/using-datafold-to-enhance-dbt-for-data-observability-3cbl,,Regression Testing +https://dev.to/sudo_pradip/dbt-and-software-engineering-4006,,"Regression Testing, Unit Testing, Acceptance Testing" +https://dev.to/a1qa_testing/how-to-ensure-the-quality-of-smart-contracts-in-decentralized-applications-1j9a,Jest,"Behavior-Driven Development, Integration Testing, Load Testing" +https://dev.to/m1pko/data-quality-technical-debt-from-hell,,Regression Testing +https://dev.to/sureshayyanna/learn-sql-in-7-days-sdet-5hc8,Cucumber,Test-Driven Development +https://dev.to/yashbansal651/23-software-testing-trends-to-look-out-for-in-2023-2lcf,"Selenium, Appium",Regression Testing +https://dev.to/elthrasher/mocking-aws-with-jest-and-typescript-199i,"Mockito, Jest","Unit Testing, Integration Testing" +https://dev.to/locally/spatial-big-data-systems-a-retrospective-13oa,Selenium, +https://dev.to/gewenyu99/what-we-learned-from-analyzing-202-million-ci-jobs-in-trunk-flaky-tests-part-2-1363,"JUnit, JUnit", +https://dev.to/adityabhuyan/ai-powered-software-testing-unlocking-benefits-for-large-scale-projects-59ja,,Regression Testing +https://dev.to/berthaw82414312/test-automation-frameworks-key-to-effective-and-efficient-software-testing-4bin,"Selenium, Cucumber, Appium","Regression Testing, Unit Testing, Integration Testing" 
+https://dev.to/aws-builders/my-journey-into-the-cloud-getting-aws-certified-323c,,Smoke Testing +https://dev.to/kayis/what-are-the-alternatives-to-unit-tests-2jii,,"Unit Testing, Integration Testing" +https://dev.to/berthaw82414312,"Selenium, Appium","Test-Driven Development, Exploratory Testing, Regression Testing, Unit Testing, Integration Testing" +https://dev.to/testscenario/what-is-performance-testingtypes-of-performance-testing-4mfi,,"Regression Testing, Load Testing" +https://dev.to/keploy/test-data-generator-enhancing-software-testing-efficiency-2njm,,"Regression Testing, Acceptance Testing, Load Testing" +https://dev.to/dataform/how-to-write-unit-tests-for-your-sql-queries-2hd7,,"Regression Testing, Unit Testing" +https://dev.to/glensmith088/top-trends-in-software-testing-using-ai-ml-in-2020-2l1i,Selenium, +https://dev.to/bryla_piotr/list-of-54-uk-companies-hiring-for-tech-internships-now-4inf,,"Unit Testing, Integration Testing" +https://dev.to/adityabhuyan/best-practices-for-data-security-in-big-data-projects-78p,"Selenium, Appium", +https://dev.to/sukumaar/apache-spark-unit-testing-strategies-451j,"JUnit, JUnit","Test-Driven Development, Unit Testing" +https://dev.to/lambdatest/top-automation-testing-trends-to-look-out-in-2020-o2e,"Selenium, TestNG, Appium, Jest","Exploratory Testing, Regression Testing" +https://dev.to/emilyjohnsonready/unlock-10-secrets-to-90-data-migration-success-59db,Selenium, +https://dev.to/meghasharmaaaa/devops-toolchain-mlo,"JUnit, Selenium, TestNG, JUnit", +https://dev.to/t/testing/page/73,"Selenium, Postman, Jest","Regression Testing, Integration Testing" +https://dev.to/adityabhuyan/scala-vs-java-the-superior-choice-for-big-data-and-machine-learning-enm,Selenium, +https://stackoverflow.com/questions/65215835/how-to-generate-big-data-volume-to-perform-load-test-using-jmeter,,Load Testing +https://stackoverflow.com/questions/63242862/can-we-do-big-data-load-testing-by-using-java-request-sampler,,Load Testing 
+https://stackoverflow.com/questions/62638491/jmeter-hbase-testing-how-to-preform-the-load-testing-for-big-data,,Load Testing +https://medium.com/@seckindinc/preventing-data-quality-issues-with-unit-testing-1b0565d3a4db,,Unit Testing +https://kovidrathee.medium.com/data-quality-and-testing-frameworks-316c09436ab2?responsesOpen=true&sortBy=REVERSE_CHRON,Cucumber,Unit Testing +https://medium.com/@xenonstack/best-practices-for-implementation-of-testing-in-big-data-8f048513af63,,Load Testing +https://medium.com/policygenius-stories/data-warehouse-testing-strategies-for-better-data-quality-d5514f6a0dc9,,Unit Testing +https://medium.com/dbsql-sme-engineering/how-to-build-an-end-to-end-testing-pipeline-with-dbt-on-databricks-cb6e179e646c,,Unit Testing +https://medium.com/weareservian/the-test-pyramid-and-data-engineering-with-julia-e4678c3f8dff,,Unit Testing +https://medium.com/data-engineer-things/mastering-data-quality-10-essential-checks-with-real-world-examples-and-7-best-practices-fa303f2ae42b,,Regression Testing +https://medium.com/data-quality-beyond/beginners-guide-to-data-testing-e2258a910c22,,"Unit Testing, Integration Testing, Acceptance Testing" +https://medium.com/@wyaddow/maintain-data-quality-with-data-refactoring-tests-f46580d0b43e,,Regression Testing +https://medium.com/people-ai-engineering/data-quality-automation-with-apache-spark-ac87cbbf3c37,,Integration Testing +https://medium.com/@sachan.pratiksha/mastering-the-unittest-module-in-python-best-practices-for-big-data-testing-with-pyspark-0549259d7e69,"JUnit, JUnit", +https://medium.com/womenintechnology/unit-tests-for-better-data-quality-0c19014a948c,,"Unit Testing, Integration Testing" +https://medium.com/@MWanyoike/best-practices-for-dataops-testing-how-to-ensure-data-quality-and-accuracy-through-effective-2b156423e143,,"Regression Testing, Integration Testing" +https://medium.com/@shakti.garg/test-driven-development-tdd-for-big-data-project-9b626149fa76,"JUnit, JUnit",Unit Testing 
+https://medium.com/analytics-vidhya/data-lake-and-quality-assurance-2dd5de3a0e67,,Smoke Testing +https://www.linkedin.com/advice/0/how-can-you-design-data-quality-test-cases-skills-data-quality,Selenium, +https://www.linkedin.com/pulse/big-data-test-environment-setup-rohini-nair?trk=articles_directory,Selenium, +https://www.linkedin.com/advice/1/how-do-you-use-unit-testing-integration-big-data-projects,"JUnit, Selenium, TestNG, Cucumber, JUnit","Test-Driven Development, Behavior-Driven Development, Regression Testing, Unit Testing, Integration Testing, Acceptance Testing, Smoke Testing, Load Testing" +https://www.linkedin.com/pulse/big-data-testing-complete-guide-testrigor-j9gle,,"Regression Testing, Integration Testing, Load Testing" +https://www.linkedin.com/pulse/big-data-testing-smriti-saini-1e,,"Acceptance Testing, Load Testing" +https://www.linkedin.com/advice/0/how-do-you-reduce-data-quality-testing-risks-your,,"Regression Testing, Unit Testing, Integration Testing" +https://www.linkedin.com/pulse/big-data-testing-bugs-allowed-alexander-protasov,Selenium,Test-Driven Development +https://www.linkedin.com/advice/3/how-can-you-prepare-future-data-quality-testing,,"Test-Driven Development, Unit Testing, Integration Testing" +https://www.linkedin.com/pulse/week-19-tdd-big-data-domain-role-unit-testing-marabesi-matheus-,,"Test-Driven Development, Exploratory Testing, Unit Testing" +https://www.linkedin.com/pulse/big-data-testing-overview-rohini-nair,Selenium, +https://www.linkedin.com/posts/colin-manko_we-need-to-talk-about-data-quality-data-activity-7249050675496583168-YeUy,,"Unit Testing, Integration Testing" +https://www.linkedin.com/advice/3/how-can-you-ensure-data-quality-testing-alignment,,"Unit Testing, Integration Testing, Acceptance Testing" +https://www.linkedin.com/pulse/navigating-complexities-challenges-big-data-testing-muhammad-usman-ogt5f,"Selenium, Cucumber, Appium", 
+https://www.linkedin.com/advice/0/how-can-you-identify-resolve-data-quality-issues-1e,,Regression Testing +https://www.linkedin.com/pulse/big-data-testing-smriti-saini?trk=articles_directory,,Acceptance Testing +https://www.linkedin.com/posts/chrisbergh_data-observability-and-data-quality-testing-activity-7216467879272087552-A24z,,Smoke Testing +https://www.linkedin.com/pulse/rethinking-big-data-testing-glint-phoropter-jordan-bonilla,,Unit Testing +https://www.linkedin.com/pulse/technical-testing-big-data-kushan-amarasiri,"Selenium, TestNG", +https://www.linkedin.com/advice/0/what-best-way-align-testing-big-data-requirements-05eye,Selenium, +https://www.linkedin.com/pulse/unlocking-insights-art-big-data-testing-quality-part-solanki,"Selenium, Appium", +https://www.linkedin.com/pulse/road-map-part-2-from-non-tech-big-data-testing-krishna-kayaking?trk=public_profile_article_view,,Exploratory Testing diff --git a/docs_to_import/RSL-Daase2024/Advancing beyond technicism-2022.pdf b/docs_to_import/rsl_daase2024/Advancing beyond technicism-2022.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Advancing beyond technicism-2022.pdf rename to docs_to_import/rsl_daase2024/Advancing beyond technicism-2022.pdf diff --git a/docs_to_import/RSL-Daase2024/An enhanced grey wolf optimizer boosted.pdf b/docs_to_import/rsl_daase2024/An enhanced grey wolf optimizer boosted.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/An enhanced grey wolf optimizer boosted.pdf rename to docs_to_import/rsl_daase2024/An enhanced grey wolf optimizer boosted.pdf diff --git a/docs_to_import/RSL-Daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf b/docs_to_import/rsl_daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/An industry 4.0 approach to large scale production of satellite 2022.pdf rename to docs_to_import/rsl_daase2024/An industry 4.0 
approach to large scale production of satellite 2022.pdf diff --git a/docs_to_import/RSL-Daase2024/Assessing business value of Big Data 2017.pdf b/docs_to_import/rsl_daase2024/Assessing business value of Big Data 2017.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Assessing business value of Big Data 2017.pdf rename to docs_to_import/rsl_daase2024/Assessing business value of Big Data 2017.pdf diff --git a/docs_to_import/RSL-Daase2024/BIGOWL2019.pdf b/docs_to_import/rsl_daase2024/BIGOWL2019.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/BIGOWL2019.pdf rename to docs_to_import/rsl_daase2024/BIGOWL2019.pdf diff --git a/docs_to_import/RSL-Daase2024/Big data analytics 2022.pdf b/docs_to_import/rsl_daase2024/Big data analytics 2022.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Big data analytics 2022.pdf rename to docs_to_import/rsl_daase2024/Big data analytics 2022.pdf diff --git a/docs_to_import/RSL-Daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf b/docs_to_import/rsl_daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf rename to docs_to_import/rsl_daase2024/Implementation_of_Big_Data_Analytics_for_Machine_Learning_Model_Using_Hadoop_and_Spark_Environment_on_Resizing_Iris_Dataset.pdf diff --git a/docs_to_import/RSL-Daase2024/Investigating the adoption of big data 2019.pdf b/docs_to_import/rsl_daase2024/Investigating the adoption of big data 2019.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Investigating the adoption of big data 2019.pdf rename to docs_to_import/rsl_daase2024/Investigating the adoption of big data 2019.pdf diff 
--git a/docs_to_import/RSL-Daase2024/Performance in Distributed Big Data.pdf b/docs_to_import/rsl_daase2024/Performance in Distributed Big Data.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Performance in Distributed Big Data.pdf rename to docs_to_import/rsl_daase2024/Performance in Distributed Big Data.pdf diff --git a/docs_to_import/RSL-Daase2024/Quality Assurance for Big Data Application.pdf b/docs_to_import/rsl_daase2024/Quality Assurance for Big Data Application.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Quality Assurance for Big Data Application.pdf rename to docs_to_import/rsl_daase2024/Quality Assurance for Big Data Application.pdf diff --git a/docs_to_import/RSL-Daase2024/Schema on read modeling approach as a basis of.pdf b/docs_to_import/rsl_daase2024/Schema on read modeling approach as a basis of.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/Schema on read modeling approach as a basis of.pdf rename to docs_to_import/rsl_daase2024/Schema on read modeling approach as a basis of.pdf diff --git a/docs_to_import/RSL-Daase2024/White-Box Testing of Big Data Analytics with Complex.pdf b/docs_to_import/rsl_daase2024/White-Box Testing of Big Data Analytics with Complex.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/White-Box Testing of Big Data Analytics with Complex.pdf rename to docs_to_import/rsl_daase2024/White-Box Testing of Big Data Analytics with Complex.pdf diff --git a/docs_to_import/RSL-Daase2024/alexandrov2013.pdf b/docs_to_import/rsl_daase2024/alexandrov2013.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/alexandrov2013.pdf rename to docs_to_import/rsl_daase2024/alexandrov2013.pdf diff --git a/docs_to_import/RSL-Daase2024/chen2018.pdf b/docs_to_import/rsl_daase2024/chen2018.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/chen2018.pdf rename to docs_to_import/rsl_daase2024/chen2018.pdf diff --git 
a/docs_to_import/RSL-Daase2024/demirbaga2022.pdf b/docs_to_import/rsl_daase2024/demirbaga2022.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/demirbaga2022.pdf rename to docs_to_import/rsl_daase2024/demirbaga2022.pdf diff --git a/docs_to_import/RSL-Daase2024/ghazal2013.pdf b/docs_to_import/rsl_daase2024/ghazal2013.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/ghazal2013.pdf rename to docs_to_import/rsl_daase2024/ghazal2013.pdf diff --git a/docs_to_import/RSL-Daase2024/gulzar2018.pdf b/docs_to_import/rsl_daase2024/gulzar2018.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/gulzar2018.pdf rename to docs_to_import/rsl_daase2024/gulzar2018.pdf diff --git a/docs_to_import/RSL-Daase2024/peng2020.pdf b/docs_to_import/rsl_daase2024/peng2020.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/peng2020.pdf rename to docs_to_import/rsl_daase2024/peng2020.pdf diff --git a/docs_to_import/RSL-Daase2024/prom-on2014.pdf b/docs_to_import/rsl_daase2024/prom-on2014.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/prom-on2014.pdf rename to docs_to_import/rsl_daase2024/prom-on2014.pdf diff --git a/docs_to_import/RSL-Daase2024/rabl2015.pdf b/docs_to_import/rsl_daase2024/rabl2015.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/rabl2015.pdf rename to docs_to_import/rsl_daase2024/rabl2015.pdf diff --git a/docs_to_import/RSL-Daase2024/shapira2016.pdf b/docs_to_import/rsl_daase2024/shapira2016.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/shapira2016.pdf rename to docs_to_import/rsl_daase2024/shapira2016.pdf diff --git a/docs_to_import/RSL-Daase2024/skracic2017.pdf b/docs_to_import/rsl_daase2024/skracic2017.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/skracic2017.pdf rename to docs_to_import/rsl_daase2024/skracic2017.pdf diff --git a/docs_to_import/RSL-Daase2024/staegemann2019.pdf b/docs_to_import/rsl_daase2024/staegemann2019.pdf similarity 
index 100% rename from docs_to_import/RSL-Daase2024/staegemann2019.pdf rename to docs_to_import/rsl_daase2024/staegemann2019.pdf diff --git a/docs_to_import/RSL-Daase2024/xia2019.pdf b/docs_to_import/rsl_daase2024/xia2019.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/xia2019.pdf rename to docs_to_import/rsl_daase2024/xia2019.pdf diff --git a/docs_to_import/RSL-Daase2024/zhang2017.pdf b/docs_to_import/rsl_daase2024/zhang2017.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/zhang2017.pdf rename to docs_to_import/rsl_daase2024/zhang2017.pdf diff --git a/docs_to_import/RSL-Daase2024/zhang2018.pdf b/docs_to_import/rsl_daase2024/zhang2018.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/zhang2018.pdf rename to docs_to_import/rsl_daase2024/zhang2018.pdf diff --git a/docs_to_import/RSL-Daase2024/zhang2019.pdf b/docs_to_import/rsl_daase2024/zhang2019.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/zhang2019.pdf rename to docs_to_import/rsl_daase2024/zhang2019.pdf diff --git a/docs_to_import/RSL-Daase2024/zheng2017.pdf b/docs_to_import/rsl_daase2024/zheng2017.pdf similarity index 100% rename from docs_to_import/RSL-Daase2024/zheng2017.pdf rename to docs_to_import/rsl_daase2024/zheng2017.pdf diff --git a/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt b/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt new file mode 100644 index 0000000..a20cfc0 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/100-Scalable Approaches for Test Suite Reduction.txt @@ -0,0 +1,160 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2019 IEEE/ACM 41st International Conference on Software Engineering (ICSE) +Scalable Approaches for Test Suite Reduction +Emilio Cruciani∗, Breno Miranda†§, Roberto Verdecchia∗‡, and Antonia Bertolino§ +∗Gran Sasso Science Institute | L’Aquila, Italy +†Federal University of Pernambuco | Recife, Brazil +‡Vrije Universiteit Amsterdam | Amsterdam, The Netherlands +§ISTI – Consiglio Nazionale delle Ricerche | Pisa, Italy +∗emilio.cruciani@gssi.it | †bafm@cin.ufpe.br | ‡roberto.verdecchia@gssi.it | §antonia.bertolino@isti.cnr.it + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract—Test suite reduction approaches aim at decreasing software regression testing costs by selecting a representative subset from large-size test suites. Most existing techniques are too expensive for handling modern massive systems and moreover depend on artifacts, such as code coverage metrics or specification models, that are not commonly available at large scale. We present a family of novel very efficient approaches for similarity- based test suite reduction that apply algorithms borrowed from +the big data domain together with smart heuristics for finding +an evenly spread subset of test cases. The approaches are very general since they only use as input the test cases themselves (test source code or command line input). We evaluate four approaches +in a version that selects a fixed budget B of test cases, and also in an adequate version that does the reduction guaranteeing some fixed coverage. The results show that the approaches yield a fault detection loss comparable to state-of-the-art techniques, while providing huge gains in terms of efficiency. When applied to a suite of more than 500K real world test cases, the most efficient of the four approaches could select B test cases (for varying B values) in less than 10 seconds. 
+Index Terms—Clustering, Random projection, Similarity- based testing, Software testing, Test suite reduction. +I. INTRODUCTION +In recent years testing has consistently been the most ac- tively investigated topic of main software engineering confer- ences [6]. One prominent problem in software testing research can be abstracted as: Given a software S and an associated test suite T, how can we efficientlyverify whether S passes on T, or -if not- identify the failing test cases? In this formulation, the emphasis is on the term “efficiently”: Otherwise, the easy solution would be to just execute S on T. The research targets the common practical case that along the development process S needs to be repeatedly tested on T (see, e.g., [15]) and the plain retest-all strategy may be too costly considering the available resources (e.g., time). +To address the above question, in the last three decades many techniques have been proposed, which can be roughly divided in two groups: those that aim at reordering the test cases in T so that those more likely to fail are executed first (test case prioritization), and those that select a subset T ⊆ T that should ideally include the failing test cases, if any; the latter group of techniques is referred to as test case selection or test suite reduction,1 depending on whether when choosing +1Some authors use the term minimization in place of reduction when the not selected test cases are permanently removed from the test suite. Here, in line with [34], we will consider the two terms as interchangeable. +1558-1225/19/$31.00 ©2019 IEEE DOI 10.1109/ICSE.2019.00055 +T the changes made to S are considered (modification-aware regression testing) or not [34]. 
+The proposed techniques have been evaluated and compared against each other using metrics relative to their fault detection effectiveness (e.g., the Average Percentage of Fault Detection of the reordered test suite, or the loss in faults detected by the reduced test suite T ); for test reduction and selection, also metrics relative to cost savings, e.g., the size or the execution time of T are compared against those of the full suite T. +Another important factor that should be taken into account is the cost of the technique itself, both in terms of the compu- tational effort and of the resources it requires. In other words, when evaluating whether investing on an automated approach aimed at reducing the cost of testing is worth, a complete cost- benefit analysis should also include the overheads implied by the approach [18]. +However, not many of the proposed techniques have consid- ered such implied costs. In 2004, Orso and coauthors already noticed that in regression testing efficiency and precision need to be traded off, because “precise techniques are generally too expensive to be used on large systems” [29]. Gligoric and coauthors [16] were the first to observe that the time consumed by any regression test technique should include an analysis phase, an execution phase, and a collection phase. They noticed that most authors only considered the savings in execution, a few measured also the analysis time, but no one before them measured also the last phase in which the information needed to apply the technique is collected. As pointed out by Elbaum and coauthors [15], at scale industries need approaches “that are relatively inexpensive and do not rely on code coverage information”. In fact, for white-box techniques, the cost of collecting and saving up-to-date code coverage information should also be considered as part of the collection phase. 
This is confirmed by Herzig [19], who observes that code coverage is not for free as assumed in many works, and can cause up to 30% of time overhead! +In a recent work [28], we addressed the prioritization of very large test suites and showed that as the size of the test suite grows, most existing approaches become soon not applicable. That work proposed the FAST family of similarity-based test prioritization approaches that outperformed in efficiency and scalability all the compared approaches, except for the white- box greedy total approach. If we count the often ignored +419 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +costs of measuring coverage, then FAST appears as the only scalable prioritization approach. +This paper introduces a family of scalable approaches for test suite reduction, called the FAST-R family. As in [28], FAST-R approaches are similarity-based and borrow tech- niques from the big data domain. However, with respect to [28] we apply here several new techniques that allow us to achieve even more efficient results. In FAST we used minhashing and locality-sensitive hashing algorithms [25]. FAST-R approaches adopt other efficient heuristics that are used to derive a set of B evenly spread points in a big data space. Precisely, one approach called FAST++ applies the k-means++ algorithm [4], while another one called FAST-CS uses a recent importance sampling algorithm to construct coresets, a clustering technique that scales up to massive datasets [5]. Moreover, we further enhance the scalability of both approaches by applying the random projection technique, that reduces the space dimensionality while preserving the pairwise distances of the points [21]. 
+FAST++ and FAST-CS are extremely “practical” techniques in the sense required by all of [15], [16], [19], [28]: i) thanks to the heuristics imported from the big data domain they are computationally very efficient; ii) to reduce a test suite T they require no other information beyond T itself. +Based on the applied algorithms, the most natural scenario for FAST++ and FAST-CS is that of finding a fixed budget B of test cases. This is referred in literature as inadequate test suite reduction. In the paper we also show how they can be adapted to perform adequate reduction, i.e., preserving coverage: We apply a filtering strategy and search for the most dissimilar test cases only among the ones that cover not yet covered elements. However we acknowledge that at large scale such adequate scenario is not realistic, because as already said coverage information cannot be assumed. +Although originally proposed for prioritization, we note that FAST approaches [28] could be easily adapted for test reduc- tion: Instead of ordering the whole test suite, the algorithm is stopped when the budget B (or the desired coverage) is reached. Accordingly, we also include in FAST-R and evaluate the reduction version of FAST-pw and FAST-all (the most precise and the most efficient of the FAST family). +Summarizing, this paper proposes four test suite reduction approaches (two original ones and two adapted from [28]) that can be applied in two testing scenarios: under a fixed budget or for adequate test suite reduction. +We evaluated the four proposed approaches on commonly used C and Java benchmark programs against state-of-the- art reduction techniques, obtaining comparable results for effectiveness but notable improvements in efficiency. More interestingly, to validate our claims on the scalability of the approaches, we applied all four of them to the budget reduction of a test suite formed by more than 500K Java test cases collected from GitHub. 
At such large scale, not considering the preparation time, FAST-pw and FAST++ required several hours to reduce the suite, e.g., ∼37 hours and ∼11 hours respectively for a 10% size, but FAST-all required 25 seconds +and FAST-CS 9 seconds. Actually, FAST-CS looks as a real breakthrough as it took less than 10 seconds for the reduction independently from the percentage, and needed just 5 minutes for preparation in contrast to more than 3 hours taken by FAST-all. +The original contributions of this work include: +• The FAST-R family of scalable approaches for inade- quate test suite reduction. +• A variant of all the approaches for adequate test suite reduction. +• A large-scale experimentation for evaluating the effi- ciency and effectiveness of the approaches in three sce- narios, including a very large-scale test suite. +• An open-source automated framework along with all the data used for the experiments to support verifiability. +The paper is structured as follows. In the next section we survey related work. In Section III we present the approaches used. In Section IV and V, respectively, we present the evalua- tion methodology and the achieved results. Finally, Section VI draws conclusions and hints at future work. +II. RELATED WORK +This work is related to software regression testing and more specifically to test suite reduction techniques. The literature on software regression testing is huge: Two surveys [13], [35] provide a broad overview of prioritization, reduction (or minimization, used here in interchangeable way), and selection techniques. In particular, Yoo and Harman [35] reviewed the literature until 2009. Concerning reduction techniques, most of the surveyed works consists of heuristics over white-box coverage criteria, at various level of granularity (including statement, branch, function, or call-stack). 
Some approaches augment the coverage information with additional inputs by the tester (e.g., weighting coefficients or priority assignments), which may be costly or even biased [35]. Among the few “interesting exceptions” doing black-box reduction, they report some combinatorial, fault-based, and model-based techniques. More recently, Do [13] surveys further advances over [35]. In particular, for test suite reduction she reviews four more recent techniques, two of which are again coverage-based, and two ones introduce specific reduction techniques: one for GUI testing [3], and another for combinatorial interaction testing [7]. Note that both surveys [13], [35] include no work on similarity-based test suite reduction, as we propose here. +A recent systematic survey by Rehman and coauthors [23] focuses specifically on test suite reduction. The study sur- veyed the literature between 1990 and 2016, identifying a set of 113 relevant primary studies. Based on the adopted algorithms, they classify the approaches into: Greedy (mostly coverage-based), Clustering, and Search-based, plus hybrid combinations thereof. Our approach would fitin the Clustering group, in which out of the surveyed 113 studies they only find three works: one [8] using machine learning algorithms, and two [27], [33] using hierarchical clustering. +We take here a distance from most of the techniques surveyed in the above studies, since FAST-R is expressly +motivated by considerations of scalability and practical ap- plicability. In this perspective, our approach is more closely related to few recent works based on coarse-grained heuristics, clustering, and similarity. +In recent years some collaborative efforts between academic and industrial researchers start to appear that develop coarse- grained approaches trading precision with efficiency/scalabil- ity. 
Strictly speaking such works focus on test case selec- tion and not test suite reduction, in that the choice of tests to execute is modification-aware. For example, Knauss and coauthors [24] use a statistical model that relates the changed code fragments (or churns) with test outcomes on Ericsson systems; considering a continuous integration development environment, Elbaum and coauthors [15] propose a strategy apt for Google testing process, which combines test case selection during pre-submit testing and test case prioritization in post-submit testing. Both selection and prioritization apply heuristics based on failure history and execution windows. By relying on very efficient algorithms, our FAST-R approaches can scale up to large industrial systems as the above works, while not sacrificing much of precision in deriving a represen- tative subset of the test cases. +Our similarity-based approach is related to several tech- niques that exploit the diversity among test cases for guiding selection. Some techniques build on the notion of adaptive random testing (ART) [10] that, in a few words, first selects a random set of test cases and then filters them based on their distance from the already selected test cases. Several variants instantiations of ART have been proposed, including ART-D [20] and ART-F [36] that we use as competitors to FAST-R and that are further described in Section IV. +Some black-box approaches use similarity to reduce model- based test suites. Both test case reduction [2] and test case selection [9], [17] techniques have been proposed. These techniques have been conceived for industrial use: For example Hemmati and coauthors [17] pursue as a main goal a selection of test cases adjusted to the available testing budget. However, all such model-based approaches rely on the assumption that a formal model of program behavior, e.g., a LTS, is available. In contrast, FAST-R does not need to assume anything else beyond the test cases themselves. 
+A few works have proposed to leverage clustering of test cases as we do here, e.g., [11], [30]. However they calculate the similarity between two test cases based on code coverage information, which as said already could be too expensive at +the testing scale we aim. +III. THE APPROACHES +Given a test suite T and some fixed budget B ≤ | T|, the goal of similarity-based test suite reduction is to select B evenly spread test cases out of the test suite. If we model each test case as a point in some D-dimensional space, then the problem could be thought of as that of finding the central points of B clusters. The problem of clustering is NP -hard, but we are able to perform scalable similarity-based test suite +1. Test Suite 3. Random Projection +t1: grep -e 'foo' file t1 t2: grep -v -e 'foo' file +t2 t3: grep -F 'bar' file +t3 +Comp1Comp2Comp3 +2. Vector Space Model (Term Frequency) +t1 t2 +t3 +grep -e -v -F 'foo''bar' file +Fig. 1: Visual representation of FAST-R preparation phase. +reduction by borrowing a technique from the big data domain and using it in combination with some efficient heuristics. +We consider an Euclidean space, a metric space where the distance between any two points is expressed by the Euclidean distance – what one could think of as the straight line connecting them. Let $x, y \in \mathbb{R}^D$ be two points; the Euclidean distance +between them is defined as $d(x, y) = \sqrt{\sum_{i=1}^{D} (x_i - y_i)^2}$. +In the preparation phase of our approaches (Fig. 1) we transform test cases into points in the Euclidean space via the vector-space model: The textual representation of each test case, e.g., test source code or command line input (Fig. 1.1), is mapped into an n-dimensional point where each dimension corresponds to a different term of the source code and n is equal to the total number of terms used in the whole test suite. The components are weighted according to term-frequency scheme, i.e., the weights are equal to the frequency of the corresponding terms (Fig. 1.2). 
+The computation of the Euclidean distance between any two n-dimensional points can be expensive when n is large. To overcome this problem we exploit a dimensionality reduction technique called random projection. Roughly speaking, random projection works because of Johnson-Lindenstrauss Lemma [21], which states that a set of points in a high- dimensional space can be projected into a much lower- dimensional space in a way that pairwise distances are nearly preserved. In particular we use sparse random projection [1], [26], an efficient implementation of the technique that is suitable for database applications (Fig. 1.3). +We model the clustering problem as a k-means problem, with k = B. Given n points in a metric space, the goal of k- means is to find a k-partition P = {P1,...,P k} of the points that minimizes the sum of the squared Euclidean distances between each point to its closest center of one partition. Formally, the goal is to find $\arg\min_{P} \sum_{i=1}^{k} \sum_{x \in P_i} d(x, \mu_i)^2$, +where $\mu_i$ is the center of the points belonging to partition $P_i$. +There exist efficient techniques that are able to find an approximate solution to k-means. One is k-means++ [4], +Algorithm 1 FAST++ +Input: Test Suite T; Budget B +Output: Reduced Test Suite R +1: P ← RandomProjection(T ) Preparation phase 2: s ← FirstSelection(P ) +3: R ← List(s) +4: D ← Distance() Squared distance to closest point in R 5: D(s) ← 0 +6: while (Size(R) < B) do +7: for all t ∈ P do +8: if d(P(t), P(s))² < D(t) then +9: D(t) ← d(P(t), P(s))² Squared Euclidean distance 10: s ← ProportionalSample( P,D) +11: R ← Append(R,s ) +12: D(s) ← 0 +13: return R
In our case, to be more efficient, we stop at this stage and use the k selected centers as the test cases of the reduced test suite. The reduction approach that exploits k-means++ as greedy reduction strategy is called FAST++ (Algorithm 1). +FAST++ starts by preprocessing the test suite T, mapping each test case into a vector according to the vector-space model and then lowering its dimensionality via random projection (Line 1). After the preparation phase, the reduction algorithm works only on the projected data P on which the greedy selection of k-means++ is applied. First, pick the first point uniformly at random3 (Line 2). Then, until B points have not been selected: i) for each projected point t ∈P , compute the squared distance d(t,R)2 between t and its nearest center in R that has been already picked (Lines 7, 8, 9); this can be done incrementally by maintaining the minimum distance and computing only the distance with the last selected point (Lines 8, 9); ii) pick next point s with probability proportional to its distance to R (Line 10). +Another possible approach to simplify the clustering prob- lem is that of using coresets. Given a set of points S, a coreset is a small subset of S that well approximates the geometric features of S. One usually constructs a coreset first and then finds the centers of the clusters on it, reducing the complexity of the problem while still having theoretical guarantees on the solution. In our case, though, the size of the reduction grows linearly with the size of the test suite making this standard approach less efficient – the complexity of the problem would not lower much. Instead, exploiting a recent extremely efficient algorithm developed for massive datasets [5], we construct a coreset of size B and use it as reduced test suite. The algorithm is based on importance sampling: All points have nonzero +2In a minimization problem, an α-approximation algorithm finds a solution which is not worse than α times the optimum. 
+3Note that this is to stick with k-means++ algorithm, but any other criterion for the choice of the first test case is possible. +Algorithm 2 FAST-CS +Input: Test Suite T; Budget B +Output: Reduced Test Suite R +1: P ← RandomProjection(T ) Preparation phase 2: μ ← Mean(P ) +3: for all t ∈ P do +4: Q(t) ← $\frac{1}{2|T|} + \frac{d(P(t), \mu)^2}{2 \sum_{t' \in P} d(P(t'), \mu)^2}$ Importance sampling +5: R ← ProportionalSampleWithoutReplacement( P,Q,B ) +6: return R +probability of being sampled, but points that are far from the center of the dataset (potentially good centers for a clustering) are sampled with higher probability. We call the reduction approach that use this technique FAST-CS (Algorithm 2). +FAST-CS starts with the preparation phase to compute the set of projected points P (Line 1). Then, it only requires two full passes on P : First it computes the mean of the data points (Line 2) and then it uses it to compute the importance sampling distribution (Lines 3, 4). The probability of each point to be sampled is a linear combination of the uniform distribution (first term in Line 4) and of the distribution which is proportional to the squared Euclidean distance between the data point and the mean of the data (second term in Line 4). Then B points are sampled out of P without replacement with probability proportional to their importance sampling probability (Line 5) and used as reduced test suite. +Both FAST++ and FAST-CS have also been adapted to be adequate, i.e., to perform a reduction that guarantees some fixed coverage. 4 Getting coverage information of each test case as an extra input, both the proposed approaches are able to reduce the test suite such that some fixed coverage is achieved. This is possible thanks to a filtering phase. In FAST++, all test cases which would not add any extra coverage are filtered out after each selection and the next selection is carried out only among the remaining ones. 
As for FAST-CS, log|T| test cases are picked at each subsequent iteration and then importance sampling probabilities are recomputed setting to 0 the ones relative to test cases which are filtered out. Picking log|T| tests per iteration instead of just one makes the algorithm scale better to big test suites. Moreover, this choice does not increase the size of the reduced test suite since the selected test cases are still diverse among them and thus the chance of covering different parts of the software under test is still high. Finally, instead of stopping when the reduction reaches size B, both adequate approaches stop whenever the reduction achieves some fixed coverage. +As said, this work was inspired by the FAST family of test case prioritization approaches [28]: Roughly speaking, those approaches could be also used for the goal of test suite reduction by only picking the first B test cases of the prioritized test suite. To assess also their efficiency and effectiveness when applied to test suite reduction, we modified +4The pseudocodes of adequate versions are not reported for lack of space, but they can be found online [12]. +all the original algorithms to stop after B test cases are prioritized. Moreover we adapted them to be adequate as well, again using the same filtering phase introduced in FAST++ and FAST-CS. +IV. EVALUATION METHODOLOGY AND SETUP +We conducted some experiments to evaluate the effective- ness and the efficiency of the proposed approaches in different application scenarios. As a first scenario we considered the case in which test resources are limited and a tester can only run a small subset of test cases from an existing test suite: We call this the budget scenario, because we fix a priori a reduction percentage of test suite size. In this scenario we can apply the natural version of the proposed approaches. As a second case we considered adequate scenario, in which the code coverage measures of the whole test suite are preserved. 
To study this scenario, we applied the adequate version of the approaches. We also studied a third case, called the large- scale scenario, in which we apply the inadequate reduction on a very large test suite. +A. Research Questions +We address the following research questions (RQs): +RQ1: How effective are the proposed test suite reduction ap- proaches in comparison with state-of-the-art techniques? +The goal of test suite reduction is to reduce the size of a test suite while maintaining its fault detection effectiveness. Thus the effectiveness of reduction approaches is commonly measured in terms of the Fault Detection Loss (FDL), and for adequate approaches also in terms of Test Suite Reduction (TSR). Consequently we articulate the above RQ1 into the two following subquestions: +RQ1.1: [FDL] What is the fault detection loss of the pro- +posed approaches compared with that of state-of-the-art techniques? +To answer RQ1.1 we measure: $FDL = \frac{|F| - |F'|}{|F|}$, where F is +the set of faults detected by T and F′ is the set of faults detected by T′. +RQ1.2: [TSR] What is the test suite reduction achieved by +the proposed approaches compared with that of state-of- the-art techniques? +To answer RQ1.2 we measure: $TSR = \frac{|T| - |T'|}{|T|}$. +We answer RQ1.1 in both budget and adequate scenarios, and RQ1.2 only in the adequate scenario. +To evaluate the efficiency we address the following RQ: +RQ2: How much time is taken by the proposed approaches +to produce the reduced test suite? +We measure the time spent in preparation and in reduction. We answer RQ2 in all the three scenarios: In the budget and adequate scenarios we compare the time taken by the proposed approaches against state-of-the-art competitors; in the large- scale scenario we could only apply our proposed techniques, as all competitor approaches require coverage information that at such scales are not available. +B. 
Compared reduction approaches +We recall that the FAST-R family of proposed approaches consists of the newly devised FAST++ and FAST-CS plus the modified reduction versions of FAST-pw and FAST-all, first introduced for prioritization [28]. +The competitor approaches we consider are ART-D [20] and ART-F [36], which belong to the family of Adaptive Random Testing techniques [10]. In brief, they both work by first deriving a candidate set of test cases from those not yet selected that would increase coverage, and then selecting from within the candidate set the most distant test case from those already selected. The two techniques differ on the candidate set size (Dynamically changing in ART-D and Fixed in ART-F) and on the adopted distance metric (Jaccard and Manhattan, respectively). We selected these approaches because they also aim at obtaining an evenly spread set of test cases as in our approaches, and also because in the results reported in [28] they were among the best competitors to FAST. Differently from FAST-R, ART-D and ART-F use coverage measures. +Finally, we also applied the GA (Greedy Additional) ap- proach [31], which for its simplicity and effectiveness is often considered as a baseline. GA selects the test case that covers the highest number of yet uncovered elements. +For all three competitors we consider three variants, applied to coverage of function, statement, and branch. +C. Experiment material +To evaluate the budget scenario and the adequate scenario we took 5 C and 5 Java programs as experimental subjects. The C programs (consisting of Flex v3, Grep v3, Gzip v1, Sed v6, and Make v1) were gathered from the Software In- frastructure Repository (SIR) [14]. For each of these programs subsequent versions are available, each containing a varying number of seeded faults. 
In our experiment we considered for each program the version containing the highest number of difficult to reveal faults, i.e., faults that are discovered by less than 50% of the test cases. This was done to avoid including in the experiment “anomalous” versions, e.g., versions in which most faults are revealed by the majority of the test cases or no faults are revealed at all. In total, the C subjects amounted to 52,757 LoC containing 49 faults, and were accompanied by a test suite comprising 2,938 test methods. +The 5 Java programs taken into account (namely Closure Compiler, Commons Lang, Commons Math, JfreeChart, and Joda-Time) were taken from the Defects4J database [22]. Such database provides a set of programs available in different versions, each containing a single real fault. For our exper- iment, we considered the first version of the programs. In total, the Java Subjects amounted to 320,990 LoC and were accompanied by a test suite comprising 1198 test classes. +To evaluate the large-scale scenario, we used a set of more than 500K real-world test cases gathered through the GitHub hosting-service. To efficiently collect a high number of heterogeneous test cases, we selected classes committed to the master branches of the available Java repositories, precisely commits adding a single class which adheres to common +naming conventions for JUnit classes. In total through this process we collected 514,272 test cases, amounting to roughly 39 million LoC for a total size of 14 GB. +D. Experiment procedure +The experiment was performed on an AMD Opteron™ 6376 with 2.3GHz CPU, 16MB L2 cache, 64GB RAM, running Ubuntu 16.04.5 LTS. The procedure varied according to the scenario considered. More specifically: +1) Budget scenario: We fixed a set of budgets B for +each experimental subject (both C and Java). The budgets considered ranged between 1% and 30% of the total test suite size of each subject with a step increase of 1%. 
While the FAST-R approaches only required the test suite for the reduc- tion process, all competitors could take in input 3 different coverage types, namely function, statement, and branch. We therefore performed a single study for the FAST-R approaches and 3 for each of the competitors. We used each compared approach to reduce the test suite of the experimental subjects by considering all B budgets. The metrics considered were fault detection loss, preparation time, and reduction time. The measurements were repeated 50 times for each study given the stochastic nature of the approaches. +2) Adequate scenario: The FAST-R approaches require +coverage information for the filtering phase as an extra input to have an adequate reduction. The competitor approaches instead require exclusively the coverage information. For this scenario we considered function, statement, and branch cov- erage. We used the compared approaches to reduce the test suite of each experimental subject (both C and Java) so to maintain the coverage prior of the reduction. We measured fault detection loss, test suite reduction, preparation time, and reduction time. The measurements were repeated 50 times for each study given the stochastic nature of the approaches. +3) Large-scale scenario: As for the budget-scenario, we +considered a set of budgets B ranging from 1% to 30% of total test suite size of the subjects, with a step increase of 1%. In this setting we exclusively evaluated FAST-R approaches, as the other approaches require coverage information, which in this scenario is not available. To answer RQ2, we applied the approaches to the GitHub dataset for each possible reduction of B, and measured preparation time and reduction time. +V. RESULTS +In this section we report and discuss the results. Note that with the aim of supporting independent verification and replication, we make available the artifacts produced as part of this work [12]. 
The replication package includes approaches, input data, statistical analyses, and additional results. +A. The budget scenario +1) Fault Detection Loss: The box plots of Figure 2 display +the FDL of the compared approaches and more details are provided in Table I. The results are grouped by programming language because the C and Java programs investigated contain different types of faults (see Section IV-C). The approaches +c +100 75 50 25 0 + ●●● ●●●●●●●●●●●●●●●● ●●●●●●●● This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +425 diff --git a/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt b/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt new file mode 100644 index 0000000..30468c8 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/102-Quality Assurance in Big Data Analytics.txt @@ -0,0 +1,105 @@ +114 Telfor Journal, Vol. 11, No. 2, 2019. +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Quality Assurance in Big Data Analytics: An IoT Perspective +Nicole Ann Fernandes and Rupali Wagh + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +115 Telfor Journal, Vol. 11, No. 2, 2019. +Abstract —Emergence of IoT as one of the key data contributors in a big data application has presented new data quality challenges and has necessitated for an IoT inclusive data validation ecosystem. Standardized data quality approaches and frameworks are available for data obtained for a variety of sources like data warehouses, webblogs, social media, etc. in a big data application. Since IoT data differs significantly from other data, challenges in ensuring the quality of this data are also different and thus a specially designed IoT data testing layer paves its way in. 
In this paper, we present a detailed review of existing data quality assurance practices used in big data applications. We highlight the requirement for IoT data quality assurance in the existing framework and propose an additional data testing layer for IoT. The data quality aspects and possible implementation models for quality assurance contained in the proposed layer can be used to construct a concrete set of guidelines for IoT data quality assurance. +Keywords — Big Data, Internet of Things (IoT), Data Quality, Data Testing, IoT data Validation, Quality of Service (QoS). +I. INTRODUCTION +IoT or internet of things has not only changed our day to day lives but also revolutionized the entire computing and analytics paradigm. Today IoT is the key contributor in +making informed decisions across domains. With these connected devices generating enormous data, seamless integration of this data in a big data application for further analytics is the need of the hour. Since quality data is the backbone of any analytical solution, ensuring the quality of big data is a fundamental task in big data testing. Since the poor data quality may produce inaccurate results, a comprehensive data quality assurance framework is followed for big data testing [1]. The famous V’s of big data – volume, variety, velocity, and veracity bring complexities with them. This has been the reason for the inclusion of rigorous data quality check which otherwise was not required in a traditional system [2] data testing. +Paper received October 30, 2018; revised April 4, 2019; accepted May 04, 2019. Date of publication December 25, 2019. The associate editor coordinating the review of this manuscript and approving it for publication was Prof. Miroslav Lutovac. +Nicole Ann Fernandes is a postgraduate student, Department of Computer Science, CHRIST (Deemed to be University), Bengaluru, India (e-mail: fernandes.ann@mca.christuniversity.in).
+Rupali Wagh is Associate Professor with the Department of Computer Science , CHRIST (Deemed to be University), Bengaluru, India (e-mail: rupali.wagh@christuniversity.in). +In the last decade, we have witnessed the dominance of IoT and today IoT has become a major contributor in the big data application environment. It brings newer complexities in the big data ecosystem. Vastly different sensors from a huge network of connected devices produce data which require careful and systematic preprocessing before actually being fed for analytics. While the wear and tear of the devices/sensors, faulty devices, etc require actions which may be extrinsic to the computing life cycle, but identification of these issues needs to be done intrinsically by analyzing the captured data. IoT is further challenged by security concerns and network issues as they directly impact the reliability and accuracy of data. Thus, the data validation for IoT data goes beyond just data cleaning, aggregation and transformation, and shifts more towards intelligent and machine learning based methods in data testing like ontologies for data abstraction and predictive methods for threat prediction. Since IoT based big data analytics is becoming more and more prevalent, the data quality issues are becoming very significant. Additionally, IoT analytics due to its ubiquitous nature impacts human life largely and hence ensuring the quality of IoT data has become very critical. +In this paper, we discuss major data quality challenges specifically with respect to IoT data. We also elaborate the implementation models used to assure the quality of IoT data and propose an additional IoT data validation layer, which can act as a basis for constructing an IoT inclusive data quality assurance framework for any big data application. +The paper is organized as follows- Section II elaborates a generic big data test framework, section III emphasizes the dominance of IoT data in today’s big data applications. 
Section IV presents data quality challenges with respect to IoT data and various implementation models and methods required for IoT data quality assurance. Section V proposes an additional layer in Big data-IoT framework +II. BIG DATA TEST FRAMEWORK +The variety and volume of data have become a challenging aspect to databases. With unstructured, structured, semi-structured data being produced every second, data testing is extremely complex. The 4 V’s Volume, velocity, variety, and veracity of big data demand the unorthodox form of information that enables magnified insight, decision-making. Big data testing is absolutely dissimilar from general testing scenarios as it involves processing huge data quickly for a business to make better decisions. The primary goal of big data testing is cleaning, masking, monitoring big data but none of these deals with + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Fernandes and Wagh: Quality Assurance in Big Data Analytics: An IoT Perspective 116 +data validation in a big data framework which lacks the quality of data. Big data testing is verifying data to ensure data transformation, data quality, and automate the regression testing. +Validation of structured and unstructured data in a test environment increases cost and time. Big data testing is based on Extract, Transform and Load (ETL). In the Extract phase test data is uprooted from various sources, traditional databases like relational database management system (RDBMS), the test data and process are verified and in the transformation phase, once the transformation is successful, it is either sent to the data warehouse or deleted. Quality is a major issue and requires a peculiar infrastructure [2]. Data warehouse staging area is a short-term location where data from all sources are recorded. 
Since data cannot be extracted directly from all databases at the time, therefore, data in the data warehouse is momentary +Quality Assurance (QA) defines whether a product or service meets the specified requirements. Fig. 1 describes various parameters that could cause tangible and intangible losses to an organization due to poor data quality. Unreliable data leads to wastage of resources, business revenues, decisions, productivity, and prevents data from being shared in an organization. Meeting customer requirements is far beyond the reach if data is not validated and accurate. Due to unreliable systems, low-quality data collections, unorganized data, connectivity issues, technical faults between sensors lead to business loss. Data is said to be reliable and consistent when data collected and analyzed remains substantial over time. Data quality parameters, data accuracy, data timeliness, data accessibility, data accountability, data completeness, data scalability, and data security and their significance are discussed in detail in [1], [4]. + +Fig. 1. Data quality concerns in big data environment. +To ensure the quality of data the following big data quality services are generically employed in a big data testing framework [1], [5], [6]. +· Data collection: Gathering and quantifying information from various sources. +· Data cleaning: Since data is collected from various sources detecting and correcting untrustworthy, inaccurate, corrupt records data is a major role in big data testing which ensures data quality. +· Data transformation: Process of the transfiguration of dataset from a source data system to the format of a destination data system. +· Data loading: Once the data is transformed it is loaded into a big data repository such as NoSQL big database and Hadoop domain. +· Data analytics: Inspection, modeling, and modification of data into reports, conclusion, supports decision- making. 
+· Data aggregation: The arrangement of data from a database to develop datasets for data processing. +With the high computing requirement and complexities of the processes in the big data testing framework, test as service (TAAS) is gaining popularity in recent years. TAAS is primarily aimed at providing solutions regarding cost, data and packet loss, and scalability issues of IoT devices and test semantic correctness and functional features remotely [2]. TAAS with IoT testing framework rectifies unnecessary cost, traditional software testing in the development of IoT devices, provides real-world testing and reduces strain on internal resources. With emerging Machine learning methods into software testing [3], software, TAAS is becoming more and more relevant [3]. +Existing comprehensive big data quality framework is primarily centered around the data coming from data warehouses, weblogs and social media. Though IoT is an inseparable component of today’s big data application, Inclusion of IoT focused data validation is not yet seen as a mandatory element in the framework. +III. IOT KEY CONTRIBUTOR OF DATA IN BIG DATA APPLICATION +IoT enables things to actively participate in sharing data with other objects, communication over the network (wired/wireless), recognizing changes and events in other objects where things/object can react inaccurately. +The internet of things helps to connect anything with everything. IoT is connected to cellular services like 30% are phones, 23% tablets, and others are machine-to-machine communication. With the advancement of high-speed internet connection like Broadband connectivity, Google fiber which provides high-speed low latency network. +As shown in Fig. 2, it is projected that IoT will grow about 267 billion in 2020 [7]. IoT generates huge information, this information is analyzed, and resets factors based on the emergency. 
Sensors help to detect motion; a voice call may be sent through the internet or appropriate altars are sent on devices. With the advancement of technology and the use of sophisticated sensors, IoT generated data reduces human efforts and interaction and improves decision analytics. Real Time Data generated by IoT is highly preferred for decision-making because of its high business value. +IoT generated data is seldom analyzed independently and often exists as one component of the big data analytics ecosystem, Fig. 3. Big data and IoT is used widely across domains to provide diverse solutions. Big data analytics is used to examine huge datasets in order to uncover hidden patterns, customer requirements, market trends, business information, better agriculture planning, reduce the cost of + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +117 +Telfor Journal, Vol. 11, No. 2, 2019. +medical systems and decision-making. There are few domains where IoT and big data analytics has become the norm for the functioning of various processes. Health gadgets with various IoT enabled sensors are becoming the backbone of patient monitoring systems and providing phenomenal support to inefficient customer care [8], [9]. IoT devices are being used to monitor and build patient- centric, remote consultation, to help critical conditioned patients [10]. Smart farming includes technologies like IoT, big data, data mining, machine learning techniques, cloud computing which enables farmers to take actions and better- informed decisions on farming practices. Sensors are used on fields and crops which provides data points on soil conditions, detailed information on wind, water availability and pest infections [9]. Sensors like SHT10, SEN0161, Humidity sensor and Obstacle sensor (ultrasonic) are used on various hardware and software that includes AVR microcontroller atmega 16/32, ZigBee module, Raspberry pi, Dip trace, SinaProg, Raspbian Operating system. 
Thus, it is now possible to monitor productivity with just a click of a button. Smart homes technologies include a suit of IoT devices, appliances, or systems that connect into a network and can be controlled. IoT and big data fabricate the use of accommodating new devices, appliance, and other technologies. IoT is growing exponentially, Sophisticated sensors and chips are embedded into systems that surround us in a smart home environment which comprise of Temperature sensor, Voice/Sound sensors, an Air composition sensor, Infrared sensors, pressure sensors, Video cameras for surveillance. When an unusual motion takes place, an alert message is sent to the user [11], [12], [13], [14]. + +Fig. 2. Worldwide Diversification of IoT Devices, as projected by [7]. +Thus, the amount of data generated by connected devices is tremendously huge. Its assimilation in a big data system is further complicated by the variety, time dependency, compatibility, and interpretability. +IV. QUALITY IOT DATA: CHALLENGES +IoT and big data analytics has almost become omnipresent and also brings data challenges along with it. A Huge number of sensors generating an enormously high volume of diverse data requires a multifaceted data quality assurance approach. In this section, we emphasize three main characteristics of data which are essential for producing valid and applicable results namely data reliability and accuracy, data timeliness and data +interpretability. We discuss the challenges in ensuring these qualities in IoT data and review the state of art of the solutions provided for them. + +Fig. 3. IoT and Big Data Analytics. +A. Reliable and Accurate Data – IoT Security +Security and privacy of data are very crucial to the IoT paradigm. This undoubtedly is the most researched area in the field of IoT, cloud computing and big data because of its high impact on the business value of such systems. 
Though the solutions to IoT security are based in multiple domains like networks and machine learning, the primary objective is to collect genuine and authentic data. Securing systems is based on a few standard principles: confidentiality, availability, authentication, integrity. Some devices used in IoT have extremely limited storage, battery power, processing rate are unable to cope with the unique security systems and wireless networks are widely used in IoT devices which could lead to packet loss. Security is a widely researched problem in IoT and main security concerns are identified as Eavesdropping, Mac spoofing, Dictionary attack, and Man-in-the-middle attack. [14], [11]. While traditional solutions include encryption and cryptography, a newer research direction based on IoE, internet of entities with blockchain based validation mechanisms is being proposed in the research community [15]. In network security for smart home, domain is proposed in [11] where communication rules for every device are installed in every home router and are further used to filter malicious traffic. The layered architecture of IoT posed challenges in providing end to end privacy and security. Improved privacy preserving the architecture of IoT as proposed in [16] is the need of the hour which is based on the concept of using multiple cloud data stores for preserving privacy. Based on this generic architecture domain specific architecture for more secure data in IoT is also proposed. Application of machine and deep learning approaches for building robust IoT big data applications [5] are effectively used for threat categorization as well as predicting the layer where the threats can surface viz, network services surface/cloud service surface/web application interface, etc. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Fernandes and Wagh: Quality Assurance in Big Data Analytics: An IoT Perspective 118 +B. 
Data Timeliness – Real-Time Data Analytics Models
With heterogeneous data coming continuously from multiple sources spanning multiple geographic locations, it's difficult to separate valuable data from irrelevant information. IoT big data analytics is further challenged by the need for real-time data updates and its real-time analytics due to the continuous operational state of IoT devices, thus a “Fog Computing” lightweight computing paradigm becomes relevant for IoT. Fog computing is similar to cloud computing which provides temporary storage, services, and application which provides a promising solution for big data applications and IoT. Fog computing is an intermediate layer between cloud computing and data generated from various sources. It reduces the processing time and cost spent on sending huge data to the cloud. As fog nodes analyze all the data that needs to be recorded and delivered into the cloud which is used for prediction or a historical purpose. Fog nodes provide optimization approach for an IoT sensing application which improves data security and reduces data latency, faster response. Fog nodes analyze data with minimum requirements like power and fewer resources by appending an appropriate sensing module. The performance level is reduced as data is uploaded into the fog nodes [17]. Fog computing in IoT can eliminate the dependency on a centralized data center and perform the in-network computation to reduce the latency in computations. This lightweight computation also augments security solutions as it allows lightweight encryption schemes through fog-to-things paradigms [18], [19]. Data generated by sensors and devices are processed efficiently and closer to where the data is originated instead of sending it to a diverse data center as is done by edge computing. A massive amount of data is collected and processed by edge devices locally, stores condemnatory data. Edge computing is closer to end users and provides Quality of Services (QoS) to end users. Edge computing nodes are also called edge/cloudlet servers. Edge servers reduce operating cost, provide real-time analysis, reduce network traffic and improve the performance of applications [20].
C. Data Interpretability – Semantics of IoT Generated Big Data
The three V’s of big data volume, velocity, and variety are inherently applicable to IoT data. Before integrating this data with other non-IoT data for further analytics, high-level abstraction of the raw IoT data can improve the interpretability of the data. IoT requires algorithms that can analyze data that comes from a variety of sources in real-time. Semantic technologies tend to enhance the abstraction of IoT data through annotation algorithms [17]. The “variety” of IoT data encompasses time series data, streaming data, geographical data, data coming from wearable devices, etc. Providing insights based on these raw values requires a plethora of algorithms. Semantic technologies for interoperability on IoT are one of the latest research field in IoT [14], [21]. Due to the heterogeneity of devices and platforms in any big data and IoT framework, augmenting data with semantics that the data represents can add a very high value to the raw data that accumulates with a very high velocity. Recent paradigms like Resource Description Framework (RDF) are gaining popularity due to the flexibility that they provide in the continuous query processing [22]. Application of semantic annotations of IoT data in healthcare domain is discussed in [23]. The paper shows semantic annotations of the heterogeneous data gathered using IoT devices of patients and physicians to transform the data into RDF. This data is then processed by SPARQL (SPARQL Protocol and RDF Query Language) facilitating the interoperability across devices. The concept of interoperability is very much relevant in all the domains of IoT and requires standardized data representation formats. These formats essentially describe data as linked objects or entities with characteristics and relationships. Example. Ontologies are required further for knowledge sharing to interpret the data representation [24]. Semantic interoperability can be challenging: integration of multiple data sources, a distinctive ontological point of reference, P2P (peer to peer) communication, semantic discovery of data sources and services. IoT interconnected devices face standardization and reusability issues due to unpredicted faults.
Fig. 4. IoT inclusive quality assurance framework.
V. IOT INCLUSIVE QUALITY ASSURANCE FRAMEWORK FOR BIG DATA WITH IOT
IoT has made a machine to machine communication possible. We propose an additional IoT quality assurance layer before IoT data is integrated with the generic big data application. As shown in Fig. 4, the proposed IoT data validation layer sits on top of the data collection layer. A series of actions proposed in the layer would ensure that the raw IoT data is transformed into suitable abstraction before getting integrated into any new-age analytics model.
As shown in Fig. 4 an IoT data quality validation layer can be included in Big-IoT framework immediately after data collection. Before integrating raw data collected from IoT devices, a series of transformation and quality checks in the proposed layer would facilitate further analysis of this data.

Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
119
Telfor Journal, Vol. 11, No. 2, 2019.
Data accuracy and consistency, data timeliness and data usability are very important quality attributes and can affect the performance of an analytics application. Ascertaining these attributes for IoT data requires entirely different approaches and methods. Fig.
5 elaborates the difference between the data quality assurance methods with respect to IoT big data and non IoT big data applications for these above-mentioned quality attributes. +Thus, IoT data needs to undergo various transformations before its assimilation into a big data analytics framework. The data quality validation layer proposed in this study aims to encompass the features of IoT data quality listed in Fig. 5. Based on various processes and methods as mentioned transformations on raw IoT data are performed wherever necessary. Seamless implementation of measures discussed with respect to every challenge mentioned in the preceding section would assure the quality of IoT data which is the primary ingredient of any new-age analytics model. An IoT data validation workflow can be designed based on this proposed validation layer to ensure that the data is ready for integration with other data in the big data ecosystem. This validated IoT data can then be integrated with HDFS, HIVE or any other big data framework for further analysis and interpretation. + +Fig. 5. Data quality assurance: IoT Big Data vs Traditional Big data. +VI. CONCLUSION +Data testing is a critically important phase in the development of big data application. IoT is a massive game changer in the modern world where sensors are the heart of IoT and big data. IoT and big data help to connect to devices to generate data to transmit, compile, and run analyses and predict and forecast new future. This paper is an effort to highlight various dimensions of the IoT data quality. The paper also highlights the requirement of a dedicated IoT data pre-processing and validation cycle for IoT data before its integration with other data in Big data IoT paradigm. Authors emphasize a smooth and continuous amalgamation of these additional processes for futuristic IoT big data applications. +REFERENCES +[1] J. Gao, C. Xie and C. 
Tao, “Big Data Validation and Quality Assurance -- Issuses, Challenges, and Needs,” 2016 IEEE Symposium on Service-Oriented System Engineering (SOSE), Oxford, 2016, pp. 433-441. +[2] N. Elgendy and A. Elragal, “Big Data Analytics: A literature review paper,” P. Pemer (Ed): ICDM 2014, LNA 18557, PP.214-227, 2014. +[3] J. Gao, X. Bai, W. Tsai and T. Uehara, "Testing as a Service (TaaS) on Clouds," 2013 IEEE Seventh International Symposium on Service-Oriented System Engineering, Redwood City, 2013, pp. 212- 223. +[4] E. Ahmed et al., “The role of big data analytics in Internet of Things,” Computer Networks, vol. 129, Part 2, pp. 459-471, 2017. +[5] M. Gudipati, S. Rao, N. D. Mohan and N. K. Gajja, “Big data testing approach to overcome quality challenges,” Infosys publication, vol. 11, pp. 65-72, 2013. +[6] M. Mohammadi, A. Al-Fuqaha, S. Sorour and M. Guizani, “Deep Learning for IoT Big Data and Streaming Analytics: A Survey,” IEEE Communications Surveys & Tutorials, vol. 20, no. 4, pp. 2923- 2960, Fourthquarter 2018. +[7] https://iot-analytics.com/state-of-the-iot-update-q1-q2-2018- number-of-iot-devices-now-7b. +[8] P. Verdugo, J. Salvachiua and G. Huecas, “An agile container-based approach to TaaS,” 2017 56th FITCE Congress, Madrid, 2017, pp. 10-15. +[9] M. Hassanalieragh et al., “Health Monitoring and Management Using Internet-of-Things (IoT) Sensing with Cloud-Based Processing: Opportunities and Challenges,” 2015 IEEE International Conference on Services Computing, New York, NY, 2015, pp. 285- 292. +[10] H. Kim et al., “IoT-TaaS: Towards a Prospective IoT Testing Framework,” in IEEE Access, vol. 6, pp. 15480-15493, 2018. +[11] R. Kumar, et al., “Monitoring system using android App”, ARPN Journal of engineering and applied sciences, vol 12, no 19, pp. 5647- 5652, October 2017. +[12] C. Bekara, “Security Issues and Challenges for the IoT-based Smart Grid,” Procedia Computer Science, vol. 34, pp. 532-537, 2014. +[13] P. 
Bhardwaj et al., “A review paper on smart home automation”, International Journal of Scientific Research and Management Studies (IJSRMS), vol. 3, no. 6 pp. 246-250, January 2017. +[14] Z. Khan, Z. Pervez, A. G. Abbasi, “Towards a secure service provisioning framework in a Smart city environment,” Future Generation Computer Systems, vol. 77, pp. 112-135, 2017. +[15] M. Sripan, X. X. Lin, P. Petchlorlean and M. Ketcham, “Research and thinking of smart technology,” International conference on the system and electronic engineering, December 18-19, 2012. +[16] R. Saia, “Internet of Entities (IoE): a Blockchain-based Distributed Paradigm to Security,” arXiv:1808.08809v1. +[17] A. Čolaković and M. Hadžialić, “Internet of Things (IoT): A review of enabling technologies, challenges, and open research issues,” Computer Networks, vol. 144, pp. 17-39, 2018. +[18] C. Mankar et al., “Internet of Things (IoT) an Evolution,” International Journal of Computer Science and Mobile Computing, vol. 5, no. 3, pp. 772-775, March 2016. +[19] G. Sabarmathi, R. Chinnaiyan, and V. Ilango, “Big Data Analytics Research Opportunities and ChallengesA Review,” International Journal of Advanced Research in Computer Science and Software Engineering, vol. 6, no. 10, pp. 227-231, October 2016. +[20] W. Yu et al., “A Survey on the Edge Computing for the Internet of Things,” in IEEE Access, vol. 6, pp. 6900-6919, 2018. +[21] C. Maple, “Security and privacy in the internet of things,” Journal of Cyber Policy, vol. 2, no. 2, pp. 155-184, 2017. +[22] S. Pacha, S. R. Murugan and R. Sethukarasi, “Semantic annotation of summarized sensor data stream for effective query processing,” J Supercomput, 2017. +[23] P. Murdock ed., “Semantic Interoperability for the web of Things,” DOI: 10.13140/RG2.2.25758.13122, August 2016. +[24] M. Harlamova, M. Kirikova and K. Sandkuhl. “A Survey on Challenges of Semantics Application in the Internet of Things Domain.” Applied Computer Systems, vol. 21, pp. 13-21, 2017. 
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt b/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt new file mode 100644 index 0000000..38387bb --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/103-A study of software reliability on big data open source software.txt @@ -0,0 +1,114 @@ +Int J Syst Assur Eng Manag +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Software (OSS). The Open Source Software is now a movement and has seen an exponential growth in spread and depth; riding the wave of phenomenal growth in net- works and internet related technologies. The origin of OSS can be traced back to 1970s, when Richard Matthew Stallman, often known by his initials, RMS propounded the concept of OSS. RMS believed that both software and +& Ranjan Kumar +ranjan301@gmail.com +Subhash Kumar subhashkumar@andc.du.ac.in +Sanjay K. Tiwari tiwari.dr.sanjay@gmail.com +https://doi.org/10.1007/s13198-019-00777-x +ORIGINAL ARTICLE +A study of software reliability on big data open source software +Ranjan Kumar Department of Computer Science, Aryabhatta College +(University of Delhi), Benito Juarez Marg, +software development, intrinsically by their nature belongs to the body of knowledge for the humankind and thus must be shared freely. RMS introduced the free version of the +New Delhi 110021, India + • Subhash Kumar Department of Physics, Acharya Narendra Dev College +(University of Delhi), Govindpuri, Kalkaji, +widely used Unix operating system under GNU (Stallman 1998). Freedom the core concept of OSS, according to RMS was seen as a fundamental component of free speech +New Delhi 110019, India + • Sanjay K. 
Tiwari Post Graduate Department of Mathematics, Magadh +University, Bodh Gaya, Gaya, Bihar 824234, India +and strongly advocated sharing of the software s code and +123 + + +Received: 9 May 2018/Revised: 10 December 2018 + The Society for Reliability Engineering, Quality and Operations Management (SREQOM), India and The Division of Operation and Maintenance, Lulea University of Technology, Sweden 2019 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Int J Syst Assur Eng Manag +Abstract With the increasing use of Open Source Soft- ware (OSS) in high speed networking, parallel processing and distributed computing, OSS has emerged as main- stream in the last decade and is now being broadly accepted even by the traditional proprietary software development companies. The major advantages of OSS over traditional software development are less development cost, avail- ability of source code, quality and security. Software reli- ability an important attribute of software quality, is defined as the probability that a software will operate free of failures or breakdown for a specified time under speci- fied conditions (IEEE Std. 1633-2016). Investigation of Software reliability with the help of software reliability models (SRM) undertakes the estimation and prediction of the failure phenomenon of a software. In this paper we have investigated whether Non-homogeneous Poisson process (NHPP) based software reliability models fit in the big data open source software fault/bug data. We have extracted real and latest bug/fault data of Hadoop and +Spark open source big data applications, from bug track- ing/management tool Jira. For this purpose, we have also compared these models on different goodness-of-fit and prediction criteria based on collected failure data to ascertain whether a best fitted model can also be a best predictor. It is found that the best model fitting the failure data is not a best predictor model. 
+Keywords Bug Goodness of fit NHPP OSS 1 Introduction +The last decade has witnessed rapid and profound devel- opment in computer networking and internet related tech- nologies. This has heralded a new dimension to the entire gamut of software development. It has given a decisive impetus to the development of an entirely new ecosystem wherein the development process of software is essentially concurrent and distributed in nature the Open Source + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Int J Syst Assur Eng Manag +the associated idea. The salient attributes of open source software involves possession of certain sacred and free rights viz. right to use, right to reproduce, right to modify and right to distribute the software. It has to be realized that free in this praxis is not synonymous in the economic sense, rather it refers to free as in freedom to do certain acts in the software development process and doing away with restrictions which generally accompany the propriety software. This model of software development results in a more robust and reliable software; which is not only reli- able but also more efficient and productive. This model promotes transparency in projects and thereby minimizes risk in the development process of the software. The phi- losophy and practice of OSS was firmlyestablished by Eric Raymond in his seminal paper The Cathedral and the Bazaar (Raymond 1999). In this essay and later a book Eric Raymond likened the propriety software to the the Cathedral model whereas the OSS development to the Bazaar model and argued that these two models are based on antagonistic assumptions about the nature of the debugging task in software. The process of development of OSS imparts myriads of advantage to its products when compared to the commercial propriety software. The OSS are found to have fewer bugs, have better reliability, are free from vendor s lock-in periods and thus are free from vendor dependence. 
The OSS possesses better and quick support as they belong to the community rather than to a firm.These products also have educational value. A critical analysis of the claims of the suitability of OSS due to these factors has been taken up (Ven et al. 1998). It has been found out that indeed certain factors like economical products, availability of source code, support by the com- munity, independence from vendor lock-in and maturity of software do put OSS to advantage vis-a‘-vis commercial software. +Having said that, the quality of software remains a prime concern. It is important because it brings out the extent up to which the software meets the user s requirement. Therefore, qualitative and quantitative assessment of the software has attracted a lot of attention. Studies which discern the quality of the software include empirical studies and mathematical modeling. Out of the various tools available for quantitative assessment of software, the exponential model also known as reliability growth model and Software Reliability Model (SRM) are ubiqui- tously utilized. While the exponential model models the appearance of defects at the backend of the development for projecting failure pattern in the field, the SRM fixes a definite probability for the software causing a system failure over some specified operating period. A large body of empirical data supports both of these models. +Software Reliability Model (SRM) has emerged as a key indicator as well as predictor for determining the quality of +software as soon as the software is launched in the market. By definition, SRM is a mathematical expression which provides the generic form for appearance of bug in the software as a function of bug detection, bug correction and the operational environment (Std 1633). SRM is utilized to assess as well as predict reliability of a product. 
For assessment of reliability SRM seeks to fitthe data extracted for the failure of software using various statistical tech- niques like linear regression or non-linear regression. The choice of technique obviously depends upon the behavior of extracted data. For the purpose of predicting the relia- bility of the software, the expected number of bugs is estimated through fitted SRM (Lyu 1996; Yamada 2014). +The issue of reliability in case of OSS has also received some attention. Several hypotheses have been proposed to investigate the relationship, if any, between reliability and openness (Joode and Bruijne 2006). A study on OSS pro- ject s bug data has however, concluded that the traditional software reliability growth model cannot be applied for the assessment of the reliability growth of OSS because the software development paradigm of an OSS is intrinsically different from proprietary software and further goes on to suggest an alternative approach for assessment of OSS products (Zou and Davis 2008). OSS has been subjected to quality assessment quantitatively using alternative approaches (Tamura and Yamada 2009, 2010; Zhou 2005). Studies on bug tracking data of few popular OSS reveals that the OSS projects as well as closed source projects (CSS) show similar reliability growth pattern (Singh et al. 2010a, b). This has been further confirmed by the Non- homogeneous Poisson process (NHPP) based reliability models wherein similar reliability growth curve have been reported for OSS as well as CSS (Singh et al. 2010c, d). This raises the relevant question that if from a reliability point of view, the OSS behaves in the same way as CSS, then which model is most appropriate for its assessment? The bug detection rate of two OSS projects examined with in house developed software using two SRMs found that the two OSS projects exhibited different profiles of bug arrival behavior (Syed-Mohamad 2008). 
By analyzing six OSS projects' bug data, Zhou (2005) found that OSS and CSS projects exhibit a similar pattern of reliability growth. They used a general Weibull model to fit bug occurrence of OSS projects. The Weibull distribution has also been suggested by Rossi (2010) as the best model for OSS by analyzing the bug occurrence behavior of three OSS projects applying SRM. On the contrary, Rahmani (2010) discovered a fundamentally different result by using 3 models and a dataset of 5 OSS projects' bug data. They found that the Weibull was the worst model. By modeling the bug reports using nonparametric techniques for the six OSS projects' bug data, Zou (2008) observed that exponential smoothing methods and Generalized Additive models are
+better suited for reliability of OSS products. For reliability classification of OSS products, SRMs can be used suitably (Li et al. 2011).
+It is evident that a plethora of models for software reliability is available in the market as well as in the literature. Many of these models are based on the Non-Homogeneous Poisson Process (NHPP). In these models, the failure process is assumed to follow a non-homogeneous Poisson process. These SRMs generally have an intensity function, or the rate of bugs/failures in the software, given by a power-law polynomial and display a great degree of flexibility in application. For the commercially available traditional software, these NHPP models have been found to be successful and have been widely utilised for software reliability studies. However, it remains to be discerned whether these models for software reliability can also be used gainfully for the same purpose in the case of OSS. The aim of the present study is to investigate the suitability of NHPP based SRMs on OSS in general and the Big data OSS Spark and Hadoop in particular. The rest of the paper is organised as follows. In Sect.
2, some chosen SRMs which are widely used and are based on NHPP are introduced along with their characteristic functions. These models undergo evaluation or validation in Sect. 3 on two data sets on bugs/failures of two popular Big data OSS Hadoop and Spark. In this section, analysis of the data sets includes parameter estimation for the respective models. This is followed by comparison of models using Goodness-of fit criterion. The analysis also probes the assessment and predicting abilities of these SRMs for the representative datasets of the bugs reported in the chosen big data OSS. Here the criterion of goodness of fit implies how well a model predicts the dataset which has already been utilized to estimate its parameters, while how well a model predicts new data points is said to be its predictive capability i.e., predicting unseen data in future. Section 4, presents the results and interpretation of the analysis carried out in the present investigation. +2 NHPP models +NHPP models considers the number of faults per unit time as an independent Poisson random variable which evolve by a non homogeneous Poisson process (Yamada 2017). NHPP models have been very successful and are amongst the widely applied models for software reliability studies. The reasons behind popularity of NHPP are follows: +(i) These are categorized by a mean value function, m(t), which help in calculating expected number of bugs up to time t very easily. +(ii) Parameters of the model can also be computed very easily. +(iii) NHPP models are closed under time transforma- tion and superposition (Lai and Garg 2012). +Here we consider five well known conventional NHPP models to measure and evaluate them on two well estab- lished big data open source projects viz. Hadoop and Spark. Analysis is carried out to findout (i) whether they fit on them and (ii) whether a best goodness-of-fit model can also be a best predictor model. 
The five models chosen for the present study are briefly described below:
+2.1 Goel Okumoto (GO) model (Goel and Okumoto 1979)
+It is an exponential NHPP model developed by Goel and Okumoto in 1979. It was proposed on the assumption that whenever a bug is detected, it is corrected in no time and all detected bugs are mutually independent of each other.
+2.2 Kapur and Garg (KG) model (Kapur and Garg 1992; Kapur et al. 2011)
+The model, proposed by Kapur and Garg in 1992, assumes that during the debugging process some additional errors/faults may also be corrected, while removing the bonafide failures. While the bonafide failures are termed as independent faults, the additionally removed faults are deemed to be dependent faults.
+2.3 Yamada delayed S-shaped (YDS) model (Yamada et al. 1983)
+Yamada proposed this model in the year 1984 with a modification of the NHPP model. It is also considered as a generalized exponential model with the assumption that the behavior of the bug arrival pattern first increases and then decreases to obtain an S-shaped curve. A software bug detection process is described by a failure detection process and a bug isolation process.
+2.4 Inflection S-shaped model (ISM) (Ohba and Osaki 1984)
+The model was developed by Ohba in 1984 and it is based on the dependency of faults with the assumptions: a) the bug detection rate of each bug is constant, b) the isolated fault can be fully removed and some faults cannot be detected before removing some other faults.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+123
+Int J Syst Assur Eng Manag
+2.5 Pham—Nordmann—Zhang (PNZ) model (Pham et al. 1999)
+This model was proposed by Pham in the year 1999, which considered imperfect debugging situations with the assumption that during debugging a new bug can appear with a constant bug detection rate.
+The mean value function m(t) and the intensity function λ(t) are the two characteristic functions which constitute the building block of all the above models based on NHPP. While m(t) is the mean value function of the expected number of faults/bugs which have been detected/removed in the time interval [0, t], the failure intensity function
+λ(t) = dm(t)/dt measures the instantaneous rate of change of
+the expected number of failures, i.e., m(t), at time t, given that the system has not failed up to time t. Table 1 enumerates the characteristic functions of the NHPP models chosen in the present study. Here n is the total number of expected faults, f is the bug detection rate, c is the bug inclusion rate and q represents the dependent bug detection rate.
+3 Model evaluation/validation
+Once mathematical models have been selected, they are evaluated for their ability to fit the historical failure data of the software, i.e., goodness of fit. Additionally, they need to be further evaluated for their ability to predict occurrences of failures of the software in future, i.e., predictive capability. For this purpose, it involves estimation of the unknown parameters of the chosen models. As the NHPP-based software reliability models are described by non-linear functions, Non-linear least square (NLLS) and Maximum likelihood estimate (MLE) techniques are used to estimate the unknown parameters for these models on actual datasets for software failures (Kapur et al. 1999). After estimation, the parameters are validated on the given dataset to find out their fitting and predictive capabilities. We have
+carried out data analysis on two real datasets of the models under consideration using the R language, which is not only an open source software but also one of the most efficient and popular data analysis tools.
+3.1 Data set
+Among several open source software related to Big Data, we have selected here the two most widely used and established tools for analyzing big data: Hadoop and Spark.
Among the repositories of the issues for Hadoop and Spark, the present study focused on only those issues that were declared bug . Other type of issues like improvement , + wish , new feature , task or patch were excluded +so that we could deal exclusively with proper failures. Among the data classified as bugs, we have further filtered it and selected the bugs having status as closed . This means those bugs which have been resolved and verifiedby the reporter have been only considered in the analysis. The dataset was also further processed and cleaned with reso- lution defined something like cannot reproduce , du- plicate , won t fix or others. Table 2 illustrates our choice of data after processing. +Data have been downloaded from issues tracking and management tool Jira s website (Apache Website 2018). Although Hadoop has four components, we have only considered and extracted Hadoop common component s bug data. Total of 406 failures were observed in dataset D1 and 375 failures in D2. Detailed month wise bug detection pattern for Hadoop and Spark are shown in Fig. 1. +3.2 Parameter estimation +For calculation of the estimated bugs it is important to first compute the values of unknown parameters in the mean value function. Parameter estimation is generally done by using two estimation techniques; Non Linear Least Square (NLLS) and Maximum Likelihood Estimate (MLE) (Kapur et al. 2011). Since data is irregular in nature, we have used + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +123 +Int J Syst Assur Eng Manag +Table 1 Summary of NHPP +Model Model name Mean value function m(t) models with mean value +function GO Goel-Okumoto (Goel and Okumoto 1979) mðtÞ ¼n 1 e ft +KG Kapur Garg model (Kapur and Garg 1992) +a 1 eð ðfþqÞtÞ mðtÞ ¼ +1 þ q eð ðfþqÞtÞ +f +YDS Yamda Delayed S-shaped (Yamada et al. 1983) mðtÞ ¼n 1 ð1 þ ftÞe ft ISM Inflection S-shaped (Ohba and Osaki 1984) nð1 e ft Þ +mðtÞ ¼ 1þ ce ft +PNZ Pham PNZ model (Pham et al. 
1999) mðtÞ ¼nð1 e 1ftþÞdeð1 ftf Þþcnt +c + +Table 2 Collection of bug data for two OSS +OSS Project Dataset Issue type Status Resolution Period Hadoop Common Spark D1 D2 Bug Bug Closed Closed Fixed Fixed April 2014 to Dec. 2017(45 months) Sept. 2012 to Dec. 2017 (64 months + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +123 +Int J Syst Assur Eng Manag + +Fig. 1 Bug arrival pattern of Hadoop and Spark +the nonlinear function in R to calculate value of estimated parameters. It uses maximum likelihood method. The result of computed estimated value of parameters of dataset D1 and D2 are shown in Tables 3 and 4. +3.3 Comparison criteria of models +For the purpose of comparison among the various NHPP based SRMs considered here vis-a‘-vis their suitability in fitting to the bug data of the two OSS under investigation, the following criteria have been utilised. +3.3.1 Goodness-of-fit criterion +Goodness-of-fit denotes how good does a mathematical model fit to a given data . +3.3.1.1 Akaike information criterion (AIC) AIC is used to select the best model among all those models whose unknown parameters are estimated by maximum-likelihood method. +Table 3 Estimated parameters for dataset D1 + +Model n f c d q GO 417.458 0.1056 KG 401.014 0.064 0.147 YDS 400.238 0.2447 ISM 401.014 0.211 2.295 PNZ 355.58 0.307 0.004 4.806 Table 4 Estimated parameters for dataset D2 + +Model n f c d q GO 287.47 0.058 KG 363.065 0.00012 0.266 YDS 620.95 0.037 ISM 363.065 0.266 2373.89 This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+123 diff --git a/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt b/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt new file mode 100644 index 0000000..d94658c --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/106-Testing_and_Quality_Validation_for_AI_SoftwarePerspectives_Issues_and_Practices.txt @@ -0,0 +1,180 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +SPECIAL SECTION ON INNOVATION AND APPLICATION OF INTELLIGENT PROCESSING OF DATA, INFORMATION AND KNOWLEDGE AS RESOURCES IN EDGE COMPUTING +Received August 9, 2019, accepted August 19, 2019, date of publication August 23, 2019, date of current version September 9, 2019. Digital Object Identifier 10.1109/ACCESS.2019.2937107 +Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +CHUANQI TAO 1,2,3 , JERRY GAO4, AND TIEXIN WANG1,2 +1College of Computer Science and Technology, Nanjing University of Aeronautics and Astronautics, Nanjing 210016, China +2Ministry Key Laboratory for Safety-Critical Software Development and Verication, Nanjing University of Aeronautics and Astronautics, Nanjing 210016, China 3State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210093, China +4Department of Computer Engineering, San José State University, San Jose, CA 95192-01809, USA +Corresponding author: Chuanqi Tao (taochuanqi@nuaa.edu.cn) +This work was supported by the National Key Research and Development Program of China under Grant 2018YFB1003900, in part by the National Natural Science Foundation of China under Grant 61402229 and Grant 61602267, in part by the Collaborative Innovation Center of Novel Software Technology and Industrialization, in part by the Fundamental Research Funds for the Central 
Universities under Grant NS2019058, and in part by the Open Fund of the State Key Laboratory for Novel Software Technology under Grant KFKT2018B19. +ABSTRACTWith the fast growth of articial intelligence and big data computing technologies, more and moresoftwareservicesystemshavebeendevelopedusingdiversemachinelearningmodelsandtechnologies to make business and intelligent decisions based on their multimedia input to achieve intelligent features, such as image recognition, recommendation, decision making, prediction, etc. Nevertheless, there are increasing quality problems resulting in erroneous testing costs in enterprises and businesses. Existing work seldom discusses how to perform testing and quality validation for AI software. This paper focuses on quality validation for AI software function features. The paper provides our understanding of AI software testing for new features and requirements. In addition, current AI software testing categories are presented and different testing approaches are discussed. Moreover, test quality assessment and criteria analysis are illustrated.Furthermore,apracticalstudyonqualityvalidationforanimagerecognitionsystemisperformed through a metamorphic testing method. Study results show the feasibility and effectiveness of the approach. +INDEX TERMS +AI software quality validation, AI testing, testing AI software. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +I. INTRODUCTION +With the fast advance of big data analytics and AI tech- nologies, numerous AI-based software and applications have been widely accepted and used in people's daily life. AI soft- ware and applications are developed based on state-of-the-art machine learning models and techniques through large-scale data training to implement diverse articial intelligent fea- tures and capabilities. 
Current AI-based software and appli- cations are classied such as natural language processing systems, object recognition systems, recommendation sys- tems, unman-controlled vehicles and so on. Therefore, how to perform quality validation for AI software becomes a critical concern and research topic from both academic and industrial focuses. According to the report [1], the automa- tion testing market size is expected to grow from USD 8.52 Billion in 2018 to USD 19.27 Billion by 2023, at a Compound Annual Growth Rate (CAGR) of 17.7% dur- +The associate editor coordinating the review of this article and approving it for publication was Honghao Gao. +ing the forecast period (20182023). Based on recent test- ing experiences from industry on AI applications such as intelligent mobile apps, testing AI software has new prob- lems, challenges, and needs due to their special features below. +- Scientic-based development instead of engineering- +based development - Most AI software and applications are developed using scientic approaches based on AI models and training data by data scientists and big data engineers without well-dened AI software engineering process and development methods with clear quality validation require- ments and criteria. +- Limited data training and validation - AI software is +built based on machine learning models and techniques, and trained and validated with limited input data sets under ad- hoc contexts. +- Data-driven learning features - These features provide +static and/or dynamic learning capabilities that affect the under-test software results and actions. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +120164 This work is licensed under a Creative Commons Attribution 4.0 License. For more information, see http://creativecommons.org/licenses/by/4.0/ VOLUME 7, 2019 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + C. 
Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +- Uncertainty in system outputs, responses, and decision +makings - Since existing AI-based models are dependent on statistics algorithms, this brings the uncertainty in the outcomes of AI software. +These unique AI software features above cause new dif- culties and challenges in testing and quality validation. Therefore, AI quality validation and assurance becomes a critical concern and a hot research subject. Although there havebeenmanypublishedpapersaddressingdataqualityand qualityassuranceinthepast[2][4], seldomresearchesfocus on validation for AI software from function or feature view. There is an emergent need in current research to quality vali- dation issues and quality assurance solutions for AI software and applications. Testing AI software can be considered as diverse testing activities with the intent of nding AI-based software bugs (errors or other defects), verifying that the AI-based software products are t or use, assuring AI func- tionalfeatures'adequatequalityandAIsoftware'sQoS(qual- ity of system service) parameters [41], [43]. Well-dened quality validation models, methods, techniques, and tools mustbedevelopedandappliedforAI-basedsoftwaretofacil- itate the test activities to achieve well-dened test require- ments and meet pre-selected adequate testing criteria and quality assurance standards. Typical issues of quality assur- anceandvalidationforAIsoftwareandapplicationsarelisted below. +- How to perform quality assurance for big data which +couldbeutilizedastrainingdataortestingdataforintelligent algorithms? +- How to make quality validation for application service, +e.g. what is the precision of the recommendation service? +- How to validate the quality of diverse intelligent algo- +rithmsandmodels,suchasdataminingandmachinelearning methods. 
+This paper is written to provide our perspective views on AI software (specic to feature or function) testing for quality validation. The paper is organized as follows. Section II discusses the tutorial concepts about AI software testing, including test focuses, features, and requirements. Section III reviews AI-based machine testing, AI software function testing, as well as the existing testing methods potentially-used for AI software validation. Section IV dis- cusses AI software testing quality parameters and evaluation as well as test coverage analysis. Section V presents case studies on an image recognition system using the proposed quality validation approach. The conclusion remarks are in Section VI. +II. UNDERSTANDING AI SOFTWARE TESTING +Why do we need AI software testing? The fast-growing AI software and the popularity of big data-based applications bring new needs and motivations. Numerous current and future software will be built with AI-based features and functions. Existing techniques and tools are not adequate to test AI-based features and functions. There are a lack of well-dened and experience-approved quality validation + +FIGURE 1. The scope of AI software testing. +models and assessment criteria. In addition, there is a lack of AI-based testing methods and solutions for AI software. Thus, the meaning of testing AI software is illustrated in a denition below. +``Testing AI software refers to diverse testing activities for AI-based software/systems. Well-dened quality valida- tion models, methods, techniques, and tools must be devel- oped and applied for AI-based software to facilitate the test activities to achieve well-dened test requirements and meet pre-selected adequate testing criteria and quality assurance standards.'' +Therefore, testing AI features of the software includes different testing activities to nd software errors, verify the performance of software, and assuring quality validation methods need to be developed. 
The testing goal is to achieve well-dened test requirements, meet pre-dened testing cri- teria, and standards of quality assurance of the under-test AI software. +A. TEST SCOPE AND MAJOR FOCUSES +Since AI software is built with diverse machine learning models and data-driven technologies, the scope of AI soft- ware testing should cover current typically-used intelligent features, such as prediction, recognition, and recommenda- tion. Fig. 1 shows the primary scope of AI software test- ing. Objects (human, animal) related testing such as object identication, recognition, and behavior detection are an important part of AI software testing. Various intelligent applications such as business decision, recommendation and selection [35], [36], [45], intelligent commands and actions, analytics and prediction capability [37], [38], [40], [46], as well as question and answer capability are current key AI testing topics. In addition, with the advance of unmanned vehicles and their potential huge markets, how to perform control validation and healthcare check will be a big chal- lengeforAItestingandqualityvalidation.Moreover,AIsoft- ware usually involves context issues, such as scenario, loca- tion[35],time,andstakeholders,therebycausingnewtesting issues in context identication and classication. The major focuses of AI software testing are summarized as follows. +(a) Testing AI functional features to assure their adequate quality in accuracy, consistency, relevancy, timeliness, cor- rectness, and so on using data-driven and AI approaches. +(b)Testing AI software's quality of system service param- eters based on well-dened quality standards and assessment criteria. These include system performance, reliability, scal- ability, availability, robustness, and security, and etc. +(c) Apply data-driven AI techniques to facilitate AI testing +processes and test automation. +B. 
NEW TESTING FEATURES AND REQUIREMENT ANALYSIS FOR AI SOFTWARE +As discussed above, AI software and applications have numerous unique testing features such as uncertainty and limited training/test dataset. These unique features bring more interesting quality validation and QoS requirements, challenges, and needs. Based on the recent feedback from engineers at Silicon Valley, how to assure the quality of AI software becomes a critical concern and research subject cur- rently. The primary testing features are presented as follows. +Multiple dimension-based rich media input data with multi-input models. This refers to new testing solutions to deal with multi-dimensional large-scale input data sets (such as numerous image graphs and videos) of AI software. For example, the well-known AI application Seeit1 supports text, graph, voice, and audio with diverse input domains both ofine and online. +Test data set selection from big data pools. This refers to test data selection to address the special testing features of AI software. In traditional software, test data is used for nding software bugs. Nevertheless, in AI software, test data is not just used for functional or program bugs. Bugs or defectsexistedintrainingandlearningmodelsinAIsoftware are also needed to be discovered using specic test data. A typical face recognition application `how old do I look' from Microsoft2 can be tested with thousands of pictures to indicate its correctness and accuracy. However, how to select effectivetestdatatodiscoveritsidenticationproblems,e.g., the accuracy of `how old do I look' is affected by lighting condition or background objects. Furthermore, bugs from models or learning algorithms can be detected with more test data with specic goals. +Knowledge-based AI software features and behaviors This refers to apply the domain-specic knowledge to assist in testing correct and precise AI software features and behav- iors. +Uncertainty of AI software features and behaviors. 
This refers to how to dene and modeling testing objects in a certain way and obtain testable functions through different test strategies, such as metamorphic testing, mutation testing, and fuzzy testing. +Learning-basedAIsoftwarefeaturesandbehaviors. This referstondingnewtestingapproachestoaddresstheleaning +1https://itunes.apple.com/cn/app/seeit/id721911549?lDen&mtD8 2https://www.how-old.net/ + +FIGURE 2. A sample object model-based AI software. +features of AI software. For instance, the learning capa- bility of AI software is needed to be tested in an evolved environment. +Real-time context-based diverse inputs affecting system outputs, actions, and behaviors. This refers to modeling complex context factors in a real-time instance, and analyze the relationship among diverse contexts, inputs, outputs, and actions. +After identifying the primary AI features, AI function features are analyzed for testing. For each identied feature, AI testing requirements are needed to analyze for future testing. For example, before testing an object of AI software, in order to facilitate function or scenario testing, diverse features are required to classify with a well-dened category. Test models are necessary to represent the diverse features under testing. In general, models can be constructed from different perspectives for AI software, such as a knowledge test model, feature test model, object test model, and data test model. As shown in Fig. 2, features of object relation, object identication, object behavior, object classication, and object context are selected for function testing with diverse sub-features. +In general, AI software needs to be tested at both function and system levels. Test planning, test modeling, test design, and test execution are the indispensable parts of the overall testing process for both AI software and traditional software. 
Since AI software has special features such as non-oracles, timeliness, and learning capability, here function test quality evaluationisaddedparticularlyasthenalstepofAIsoftware testing process. In this step, different quality parameters are measuredusingthepre-denedqualitymetricsbasedontest- ing result analysis. If the evaluation results are not accepted by stakeholders, the testing step goes to test modeling again for a new testing iteration. +III. AI SOFTWARE QUALITY VALIDATION CATEGORY AND APPROACHES +This section rstly illustrates a category of AI software test- ing, including Turing testing, testing AI software, AI-based software testing and AI-based machine testing. Then several existing and potential approaches to AI software testing will + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +120167 +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +be presented and discussed. Moreover, test quality evaluation and test adequacy analysis are illustrated. +A. TURING TESTING +Turing test was introduced by Turing as the imitation game in 1950 [5], aiming to test a machine's ability to exhibit intelligent behavior equivalent to, or indistinguishable from, that of a human. Turing proposed that a tester would ask the testee freely through some devices (such as a keyboard) in the case where the tester is separated from the testee (one person and one machine). After multiple tests, if more than 30% of the testers are unable to determine whether the testee is ahuman or a machine, then the machine passes the testand isconsideredtohavehumanintelligence.Theturningtesthas been considered as the ``beginning'' of articial intelligence (AI) [6], and it has also become an important concept related to AI system testing. Although the Turing test was designed to advance the development of articial intelligence, it also has several shortcomings [7]. +B. 
AI SOFTWARE TESTING +In this section, the main focus is on validating AI software functions, external behaviors, and external visibility of QoS usingblack-boxtestingtechniques.Totestsoftwarefunctions and features, engineers could adopt convention black-box approaches to validate software quality. Typical examples include scenario analysis, decision table testing, equivalence partitioning,boundaryvalueanalysis,cause-effectgraph,and so on. +However, AI software testing differs from traditional soft- waretesting,sinceAIapplicationsarecharacterizedbyuncer- tainty and probabilities, dependence on big data, random input/output,difcultyinpredictingallapplicationscenarios, andconstantself-learningfrompastbehavior.Inrecentyears, many studies have worked on researching how to test AI software or systems [7][11]. +Broggi et.al proposed the Public Road Urban Driverless (PROUD) test conducted in Parma from the uni- versity campus to the town center through different scenar- ios such as urban, rural, and highway roads [7]. Similarly, Li et al. [8] indicated the difculties of intelligence tests from four aspects and presented an example of how to design intelligence tests for intelligent vehicles. The authors gave the denition and generation of intelligence test tasks for vehicles to combine the benets of scenario-based test- ing and functionality-based testing approaches based on a semantic relation diagram for driving intelligence proposed in [9]. In addition, the authors applied the parallel learning method to the vehicle intelligent test and proposed a par- allel system framework that combined the real-world and simulation-world for testing [10], [11]. +As discussed above, the process of testing AI functions includes test planning, test modeling, test case generation, testexecution,andtestqualityevaluation.Decisiontabletest- ing design technique determines the different combinations of inputs with their associated outputs and implements the +TABLE 1. 
A sample traditional scenario analysis on siri. + +business requirements or rules of the system. It is also a represented type of cause-and-effect testing or logical test- ing. Black-box testing is used to test the end-user require- ments [12], [13]. It attempts to uncover the errors in the followingcategories:missingorincorrectfunctions,interface errors, behavior or performance errors, and initialization or termination errors. +Let us take Siri3 from Apple for instance. The functions of Siri based on voice command input are listed as below: received voice commands, convert voice commands into text commands (display entered commands), nd the text response and actions that match the recognized commands, text response, action response. To verify the AI functions of the software, the traditional scenario analysis method is applied to analyze the scenarios of applications and test whether the main functions are implemented correctly from the perspective of the scene. Table 1 shows a description of ve scenarios in testingSiri. +Based on the analyzed results and testing experiences, we conclude that the test cases designed by scenario analysis are practical and effective to validate common features and conditions. However, there are some defects to generate test cases using scenario analysis as follows. +a. As a typical intelligent software application with AI +features, Siri has rich context information. The different test contexts affect the results of testing Siri, such as the back- ground noise, the tester's gender, age, and accent. +However, the traditional scenario analysis does not consider these external conditions for testing. Hence, the designed use cases are incomplete, and the execution results of some test cases failed. +b. Advanced AI software or systems have the ability to +learn from data and experiences. Furthermore, some AI sys- tems even learn from environmental interactions and learn +3https://www.apple.com/siri/ + +Evaluation Only. 
Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +120169 +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +dynamically during interaction with users. Thus, the more time you spend on using Siri, the better it will understand you. Siri achieved this by learning about your accent and some other characteristics of your voice. Therefore, if the sametesterrepeatedlytestsSiriforthesamevoicecommand, its overall recognition of dialects and accents will continue to improve, test results will be also affected. Unfortunately, traditional scenario analysis does not take this into account. +In order to test the voice-command-based AI functions more precisely, we should take different voice testing envi- ronments into account with context factors and modeling multi-dimensional testing space for AI features. Currently, we are working on this in another paper. +C. AI-BASED SOFTWARE TESTING +AI-based software testing refers to the leverage and appli- cations of AI methods and solutions to automatically opti- mize a software testing process in test strategy selection, test generation, test selection and execution, bug detection and analysis, and quality prediction [39], [42], [47]. It includes different testing activities in AI-based software testing. Due to the complexity of AI software and applications, traditional methods and test tools cannot meet the demands of testing these AI systems. Given this, a more effective method to test AI systems is desirable. +To deal with this problem, Souri et al. [14] used an AI-based testing technique named as Multi-Objective Genetic algorithm (MOGA) to reduce the number of test cases for testing web applications yet achieve maximum coverage with reduced cost, time and space. 
Considering manual testing is a tedious and time-consuming task, and it may also result in insufcient testing being performed and critical defects going unidentied, Straub and Huber [15] proposedanarticialintelligencetestcaseproducer(AITCP) to test articial intelligence system (AIS). AITCP starts from a human-generated test scenario and makes changes to it based upon a modication algorithm such as ant colony opti- mization and genetic approaches. The authors compared the resultsoftheAI-basedmethodandthemanual-basedmethod fortestinganautonomousnavigationcontrolsystembasedon selected four scenarios. The study results show that AITCP can be utilized to effectively test AIS for both surface (two- dimensional) and airborne (three-dimensional) robots. +Although there are many successful studies about the automated generation of test cases, determining whether a program has passed a given test remains largely manual. Langdonetal.[16]proposedtheuseofsearch-basedlearning from existing open-source test suites to automatically gener- ate partially correct test oracles. They argued that mutation testing, n-version computing, and machine learning could be combined to allow automated output checking to catch up with progress on automated input generation. +AI software testing differs from AI-based software testing in diverse views such as test objectives, test focuses, test scope, test coverage as well as test techniques and tools. For example, AI-based testing primarily aims to increase +efciency for a test process, reduce testing costs by reduce human operations, and increase bug detection effectiveness and speed. AI testing aims to provide on-demand testing services for AI software to support software validation and qualityengineeringprocess.AI-basedtestingmajorlyfocuses on test selection, automatic test execution, bug detection and prediction based large-scale testing history data and AI tech- niques. 
In addition, AI testing needs innovative continuous, timeliness, and currency testing techniques. +D. AI-BASED MACHINE TESTING +AI-based machine learning requires a huge number of inputs as the knowledge and different intelligent algorithms in order to make the right decision. By looking at an example using technologyinunmannedvehicles,therewillbeabasicunder- standing of how machine learning or machine intelligence work. The development of machine intelligence is still far from mimicking the cognitive competence of the human brain. It is still challenging to deal with those data effectively and making a driving decision accurately and quickly [17]. Machine learning sometimes returns an inaccurate prediction basedonthecollectionoftrainingdataandanengineerneeds tomakesomeadjustmentstoavoidsignicantlossesinterms of public safety. +DeepLearningisdesignedtocontinuallyanalyzedatawith a logic structure as mimicking how a human can draw a conclusion. The deep learning needs a huge number of data sets to use input in the algorithms in order to result in a more accurate prediction. For instance, Google's AlphaGo, a sharp intellect and intuition game, learns by itself with- out predened data. It makes a more specic move and becomes the greatest player of all. Deep Learning denes a new paradigm based on data-driven programming. Since Machine Intelligence or Deep Learning depends on the train- ing data, the accuracy and quality of data play a vital role for public safety using machine learning in autonomous vehicles. +Many kinds of research attempt to nd solutions for the current obstacles of Machine Learning Systems. To draw optimal decision making, approaches such as Fault Tree Analysis, Fuzzy Logic, Metaheuristic Algorithm, and Arti- cial Neural Network are developed to test with a huge amount of training data by using different algorithms. How- ever,thesufciencyandversatilityofDeepLearningsystems are based on the accuracy of the test data set. 
It is dif- cult to provide adequate support due to the accessibility of test data quality issue. The current Deep Learning systems have various vulnerabilities and their system analysis and defect detection are extremely difcult. Unlike traditional software systems, Machine Intelligence does not have a clear controllable logic and understandability since the process to make decisions rely on the training data. The recent study shows two major vulnerabilities in Deep Learning systems: Software quality from the output of Deep Learning alone is notadequate;andFailureinunseenattackseventhoughDeep Learning is immune to known types of attacks [18], [19]. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +120171 +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +Thus, how to make machine intelligent testable is a great challenge for future AI-based machine testing. +E. TYPICAL VALIDATION APPROACHES FOR AI SOFTWARE AI software testing could be performed using the following approaches from different perspectives. +- Classication-based AI software testing, in which classication models for test inputs, contexts, and out- puts and events are set up to ensure the adequate test- ing coverage of diverse input data classes, classied contexts and conditions, and corresponding outputs and classes [20][24]. +- Model-based AI software testing, in which selected intelligentlearningmodelsanddatamodelsareextended to be traceable and testable AI test models to facilitate AIsoftware testingand operationsin qualityassessment of training data and test data. +- Metamorphic (Non-Oracle) testing, in which a property-based software testing technique is used as an effective approach for addressing the test oracle problem and test case generation problem [25][28]. 
The key element of metamorphic testing (MT) is a set of Metamorphic Relations (MRs), which are necessary features of the target function or algorithm in relation to multiple inputs and their expected outputs.
+- Learning-based AI software testing using the crowd-sourced approach, in which selected machine learning models and approaches are used to learn from crowd-sourced testers in a service platform [30].
+- Rule-based AI software testing, in which pre-defined expert-based rules are established and used in AI test generation and validation [32], [34].
+Nevertheless, how to utilize the existing traditional or intelligent approaches to AI software testing is still a great challenge currently.
+F. DATA QUALITY VALIDATION FOR AI-BASED SOFTWARE In recent years, data (such as image and video image) quality assessment has attracted significant attention. Besides, the quality of big image/video datasets with labels also has an important impact on machine learning algorithms, such as deep learning. Using a deep learning approach to train artificial AI programs based on annotated training datasets is
+a popular way to develop intelligent software using a supervised learning approach. With the increasing installation of video cameras in many cities, image data quality assessment is becoming a very hot research topic in computer vision and smart cities.
+There are a number of causes affecting the quality of image data [48], [49], such as sharpness, noise, tone reproduction, contrast, distortion, etc. Thus, the typical image quality factors are listed as accuracy, accessibility, readability and understandability, consistency [44], etc.
+According to the recent 2018 IEEE NVIDIA AI City challenge [33], manually generating annotated datasets based
+on image datasets from city street transportation cameras brings diverse data quality issues in a deep learning process. 
Their case study result clearly indicates that the accuracy and quality of derived AI city transportation programs using a deep learning approach highly depends on the quality of annotated training data sets. Based on their experience report, all of the challenge teams encountered diverse data quality issues in annotated training datasets. And they also discovered the urgent needs in quality validation models, methods, and automatic tools for annotated datasets although there are numerous data validation tools for structure data. Therefore, the key issues of quality assurance for big data applicationsarehowtovalidateunstructureddataqualityand how to validate system quality in terms of various quality factors. +Data quality validation and services in a deep learning processforAIsoftwarehasthreedimensions.Theyareshown as follows. +- Raw data quality checking, which refers to the quality checking process and activities for collected raw data, such as camera-generated images, and videos. The pri- mary objective is to perform raw data cleaning, quality monitoring, and evaluation to ensure high-quality raw data could be collected. +- Training data quality validation, which refers to qual- ity validation processes and activities for manually or semi-automatically generated training data sets, such as annotated data sets. Its objective is to improve the generation of training data quality in a deep learning processtoincreasethetrainingqualityforanunderlying AI software. The typical concerns include: a) training data scope and coverage, b) training data classication, +c) training data quality, and d) training data coverage. +- Test data quality evaluation, which refers to test data quality evaluation based on the validation results of a targeted domain-specic application. 
For a machine learning application system, the major focus of this task should be facilitating AI system quality problem detection,defectimprovement,trainingqualitycoverage and domain-based knowledge modeling issues for AI systems. +IV. TESTING QUALITY ASSESSMENT AND ADEQUACY ANALYSIS +A. TESTING QUALITY PARAMETERS AND QUALITY ASSESSMENT FOR AI SOFTWARE +Like conventional software quality testing, quality parame- ters such as performance, robustness, security, etc., can be applicable to AI software and applications. In addition to the system quality parameters, we must pay attention to specic quality parameters for AI software functions and features. Samplequalityparametersforimagerecognitionsoftwareare presented as follows. +- Correctness This quality factor reects if the recogni- tion result is true when faced with Boolean recognition + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices + +FIGURE 3. AI software test quality assessment. +items,suchasgender,buyornot,recommendornot,age group, etc. +- Accuracy This reects the accuracy of the recognition result when faced with numerical recognition items, such as age, gender, and color. Different math index can be used to measure it, such as mean difference, variance, standard deviation, distribution interval, con- dence level, absolute mean or relative mean. +- SystemStability Thisreectsthestabilityoftherecog- nitionsystems.Forexample,torecognizethesamething twice or more times, the result should be stable. +- Timeliness This reects some indicators related to time, such as the recognition time, training time, and classify time. +- Recognition Ratio This reects the recognition ratio oftheimagesystem,suchastheperfectrecognitionratio which means the system recognizes the picture well, or recognition ratio which is divided by absolute mean or relative mean. 
+- System Robustness This parameter indicates the robustnessofthesystem.Forexample,whenperforming special operations on the recognized picture, we need to check whether the system can still recognize it well. The transformation includes overturning, mirror image, enlarging or shrinking, shearing, shear, gray scale, and changing the dpi. +- Image Quality This checks whether the recogni- tion systems can deal with the changing of the quality attribute of image, such as gauss noise, spiced salt noise due to the unreliable network transmission, etc. +Based on the discussed quality parameters above, testing resultsareanalyzedandevaluatedforqualityassessment.For example, there are ve quality factors in the set (QF) here as shown in Fig. 3. As we mentioned, AI software have a number of features (F1,...,Fn), composed of corresponding sub-features(F-s1,..., F-si,..., F-sm). For each measurable feature, we could perform test complexity (TC) analysis. In addition, the quality factors can be measured in terms of pre-dened quality metrics to show their percentage value. Quality Measurement results can be represented using a Radar Chart shown in the left part of Fig. 3. Nevertheless, +those measurement results need to be validated in practice to indicate their effectiveness. +B. AI SOFTWARE TEST ADEQUACY AND COVERAGE When AI software can be operated under different contexts andenvironments,itmustbevalidatedunderdiverseenviron- ments to achieve certain context test criteria for vendors and customers.Thus,engineersneedwell-denedtestcriteriaand an effective test coverage analysis solution. As we discussed in Section II, diverse test models can be constructed and utilized for test coverage analysis. 
For a knowledge model, AI knowledge test coverage analysis need to be performed; for a feature model, AI features, sub-features, and feature classication need to be analyzed for test coverage; and for a data-based model, data classication, data relation, data format,datarange,etc.,needtobeaddressedfortestcoverage analysis. +V. CASE STUDIES- QUALITY VALIDATION FOR ROBUSTNESS OF AN IMAGE RECOGNITION APPLICATION We performed case studies to indicate the feasibility and effectiveness of the proposed quality validation approach provided in this paper. Here we selected a face recognition system as the study object. We performed a case study on a realistic AI application system- ``Alibaba Cloud Computing Services Facial Age Recognition API'' provided by Alibaba Companyusingthemetamorphictestingmethod.Thebase64 encoding of images is submitted to APIs, and the system returns with the recognition results. The experiment data sets are selected from the wiki_crop.tar in the open face dataset IMDB-WIKI. There are total of 52444 face data, and 10K images are selected randomly as experimental data sets. +A. QUALITY VALIDATION METHOD DESIGN +The designed quality validation method is based on the robustness of the age recognition system: The recognition result is deemed better when the real age and recognition age are closer to each other. Facial age recognition is a commonly-used AI application using diverse machine learn- ing algorithms and pattern recognition strategies. There are existing non-oracle problems and due to the effect of picture quality (such as clarity, lighting, background, and expres- sion), network or other reasons, the robustness of an age recognition system is a basic quality factor in quality assur- ance. Thereby we need to test the robustness of the system. Based on the understanding of facial age recognition system above, we adopt metamorphic testing to validate the quality of the system. 
We consider the possible situations that may occur in a recognition process, such as image rotation, trans- lation, landscaping, a watermark of a picture, or the distance between face and camera. +In this study, we dened two major metamorphic relations MR1 and MR2. For each metamorphic relation, we dene several sub-relations. For instance, in MR1, we give two sub-relations MR1-1 and MR1-2, i.e., a) recognized age is + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +120173 +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +TABLE 2. Metamorphic relation case partition. + + + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +VOLUME 7, 2019 + C. Tao et al.: Testing and Quality Validation for AI SoftwarePerspectives, Issues, and Practices +stable under the spherical transformation (mirror), and b) recognized age is stable under image rotation. In the study, we veried if the image system under testing satises the dened MRs. The detailed metamorphic relations and their sub-cases are shown in Table 2. The proposed metamorphic relations are illustrated as follows. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +VOLUME 7, 2019 diff --git a/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt b/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt new file mode 100644 index 0000000..3c8dfa9 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/107-Industrial_track_Architecting_railway_KPIs_data_processing_with_Big_Data_technologies.txt @@ -0,0 +1,88 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2019 IEEE International Conference on Big Data (Big Data) +Industrial track: Architecting railway KPIs data processing with Big Data technologies +Alexander Suleykin Peter Panfilov Natalya Bakhtadze +V. A. Trapeznikov Institute of Control School of Business Informatics V. A. Trapeznikov Institute of Control Sciences, National Research University – Higher Sciences, +Russian Academy of Sciences School of Economics Russian Academy of Sciences; Moscow, Russia Moscow, Russia Bauman Moscow State Technical +aless.sull@mail.ru ppanfilov@hse.ru University +Moscow, Russia sung7@yandex.ru + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract — in our conducted research we have built the data processing pipeline for storing railway KPIs data based on Big Data open-source technologies – Apache Hadoop, Kafka, Kafka HDFS Connector, Spark, Airflow and PostgreSQL. Created methodology for data load testing allowed to iteratively perform data load tests with increased data size and evaluate needed cluster software and hardware resources and, finally, detected bottlenecks of solution. As a result of the research we proposed architecture for data processing and storage, gave recommendations on data pipeline optimization. In addition, we calculated approximate cluster machines sizing for current dataset volume for data processing and storage services. +Keywords — Big Data technologies, distributed data processing, Hadoop, Spark, railway KPIs. +I. INTRODUCTION +Nowadays the open-source solutions are becoming more and more popular and Hadoop stack with its already improved Map Reduce data processing engine is one of the most widely used technologies for big data storage. 
Based on Hortonworks Data Platform stack, it delivers 100% open-source global data management platforms and services so customers can manage the full lifecycle of their data. This stack is widely accepted by many large companies for data processing, storage, analysis and visualization. +At the same time, the complexity of big data processing and analysis is extremely increasing due to data volume growth, data variety, velocity, different data formats of data transmission, integration problems and other data complexities. At this point there is always a difficult task to build a robust, reliable and fault-tolerant data processing and storage framework that could handle big data of various formats and high volume from different data sources and systems. The current research is devoted to the application of big data technologies based on HDP Hadoop stack and its ecosystem to the building of data processing and storage platform for railway roads KPIs. +Performed case study has revealed the applicability of regarded technologies to the building of full data pipeline for data processing and storage for railway KPIs. Selected technologies are Apache Hadoop, YARN, Apache Kafka, Confluent Kafka Connector, Airflow, Apache Spark, PostgreSQL. +The conducted research generated the synthetic load tests based on datasets of real KPI data from one railway company with initial data load and X1, X2, X4, X8 increments on top of initial load. Load tests have shown the software and hardware bottlenecks for regarded datasets KPIs. The result of the work is formulation of bottlenecks of data processing pipeline, recommendations for optimization of pipeline and architectural sizing of machines and used Big Data services for current dataset of railway KPIs data storage and processing. +In this paper, the authors have discussed the railway KPIs from railway transportation operations and data-driven distributed computing perspective. 
Here, after introduction in section 1, the related works on concepts and requirements of KPI frameworks are discussed in section 2. The way to successful implementation of the distributed computing architecture for the railway KPI framework is described in section 3 with architectural layers detailed description in section 4 and dataset examples from railway industry in section 5, followed by experiments with proposed architecture and test results in sections 6 and 7. Discussions on optimization recommendations and conclusions conclude the paper. +II. RELATED WORK +Key performance indicator (KPI) is a collection of performance measures that an organization or company uses to monitor its performance over time. KPIs are used to determine a progress in achieving strategic and operational goals of a company, and to compare its performance with others within its industrial sector. Setting KPIs requires smart decision on how many indicators to track to determine the success of business. More over, the relevance of the KPIs must be continuously evaluated to ensure their alignment with priorities in business strategy and operations. Industry-specific KPIs have been created in different markets including retail, healthcare, financial services, logistics, manufacturing and supply chain operations, and transportation. +The increasing railway traffic and a corresponding need of railway capacity require a more efficient operation, maintenance and railway asset management by infrastructure managers (IMs). To support railway IMs in decision making process, KPIs are developed so that the results of operation and maintenance activities could be measured and monitored. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +978-1-7281-0858-2/19/$31.00 © 2019 IEEE +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 
978-1-7281-0858-2/19/$31.00 ©2019 IEEE 2047 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +In literature, one can find examples of projects on KPIs and benchmarking for railway transport operations and railway infrastructure maintenance [1-7]. +However, KPIs used in railway transportation sector are often ad hoc and seldom standardized. In the course of last decade, several programs were undertaken both at national and international levels to bring a common ground to a multiple efforts in developing KPI platforms for managing railway infrastructure. +In Europe, an increased interoperability and building of a trans-European railway network is one of the goals of the European Union. The required harmonization and standardization of the management of railways have led to increased use of European Standards such as, for example, the European standard; Maintenance key performance indicators (KPIs), EN 15341 [8]. In the paper [9], the authors have proposed performance indicators for railway infrastructure, that have been mapped and compared with indicators of this European standard. +In 2013, a Platform of Rail Infrastructure Managers in Europe (PRIME) was established to assist in implementation of the Single European Rail Area, better deployment of European Rail Traffic Management System (ERTMS), performance benchmarking and exchange of best practice amongst infrastructure managers. PRIME organization plays the role of the European Network of Infrastructure Managers as foreseen in Article 7f of Directive 2012/34/EU establishing a single European railway area, as amended by Directive (EU) 2016/2370. Among the major tasks of the Network there is a task under paragraph (d) “monitor and benchmark performance, including identification of common principles and practices for the monitoring and benchmarking of performance in a consistent manner”, which is carried out by the KPI's and Benchmarking Expert SubGroup. 
The subgroup is preparing yearly benchmarking reports, including the most recent PRIME KPI Catalogue [10], which contains the indicators agreed by the expert group and their definitions, set out in a structured and prioritised way following the concept of the balanced scorecard. The KPIs have been developed over a three year period and tested in 3 pilot exercises. These KPIs will be fixed for use in the initial Dashboard tool, but it is expected that they will be developed further and improved on a regular basis in the future. +A new challenges that railway KPI implementations might face are associated with the introduction of the international ISO 55000 standard [11] focused on asset management. The ISO 55000 series standard makes asset performance evaluation (APE) an important aspect of the asset management system (ASM) as per international standard ISO 55001:2014 [12]. The ISO 55000 series standard sets the asset management principles for organizations to follow when developing and implementing all of their functions including units and processes. The APE serves to improve the level of the company's assets to achieve the objectives. The asset performance measurement and management (APMM) is a recognized best practice for preparing a strategic road map from top strategic managerial level to the operational level +through a link and effect model [13] for identifying and developing KPIs. +A high level description of the elements of APMM concept can be found in [14], followed by a comprehensive discussion on specific issues and challenges of APMM. Among them, an important new data-driven challenge is ”to define and develop methods for right data collection through condition monitoring and big data management, beside management of knowledge” [14]. +Nowadays, Smart Monitoring and Smart Maintenance (eMaintenance) concepts based on distributed data processing and Big Data platforms are applied for real-time data collection, storage, analysis and decision support. 
From the business objectives perspective, it is important that data collected are linked with KPIs so that they can be analyzed to compare and measure against business strategy and organization. Depending on the business requirements, the KPIs and other indicators can be used for generating composite indicators (CI) [15] for performance benchmarking with the best in the industry, besides verifying the return on investment. Stenström et al., in [15], developed a link and effect model for monitoring and analysis of operation and maintenance performance of rail infrastructure and demonstrated it in a case study.
Data collected from smart monitoring systems in commercial and industrial setups are growing rapidly to be very large in volume, high speed in velocity and vast in variety for the data acquisition, storage, processing and analysis. Big data technologies are used for information extraction through pattern recognition and eMaintenance solutions [16, 17]. While the data collection, data quality, processing and analysis for the asset performance under Big Data analytics has taken the focal point, performance measures, indicators and key performance indicators (KPIs) dictate which data is needed to be measured and why [18].
Big Data analytics provides IMs with faster and better decisions that were inaccessible before. Nowadays, most companies use business analytics and data-driven reporting tools to automatically track their KPIs. The modern Big Data and distributed computing solutions help companies to collect relevant data from operational systems and create reports on the measured performance levels. Company executives and managers are obtaining KPI results on business intelligence dashboards or performance scorecards that include diverse linked data visualizations, with the ability to improve understanding of the company's performance data.
To guarantee the business success, KPIs and various issues and challenges of APMM should be considered thoroughly.
In this paper, we have touched the data-driven challenges of the KPI and APMM frameworks on the basis of our experience in architecting smart monitoring and management systems for mobile network industrial sector [19]. Here we have demonstrated how our expertise in distributed computing and smart data processing can be applied to somewhat similar problem area of railway asset performance monitoring and measuring for establishing railway KPI framework. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 2048 + +III. CORE ARCHITECTURAL COMPONENTS OVERVIEW Integration Layer Storage Layer Serving Layer +We propose to use Lambda architecture as a basement +architectural methodology. Thus, it allows companies to +handle their data in the most reliable and effective manner for +majority of use cases. In our previous work [19] we built +Smart Cellular network monitoring service using Big Data +methods and tools on top of Lambda-driven architecture. The +following picture depicts the key Lambda principles: +Fig. 2. Research data pipeline architectural overview +The definition of used components is according to the table below (Table 2): +TABLE II. CORE COMPONENTS DEFINITION + + Component Definition 1 JBoss Fuse Industrial data bus for solving the integration problems of the entire company [21] 2 Kafka Distributed, fault tolerant, horizontally scalable, productive message broker [22] 3 HDFS Distributed fault tolerant file system optimized for storage for processing large amounts of data [23] 4 Spark Distributed in-memory framework for high-load data processing [24] 5 PostgreSQL Relational database to provide BI data to tools [25] 6 AirFlow Universal Scheduler [26] Fig. 1. Lambda architecture overview +It’s widely assumed to highlight the following layers (Table 1): +TABLE I. 
ARCHITECTURAL COMPONENTS OVERVIEW + + Component Purpose 1 New data New data sources 2 Batch layer A layer of a full data set optimized for batch calculations. The role model is applied only at the level of subject areas (directories) and storing objects 3 Serving layer Provides fast (including random) access to structured data for consumers. Data should already be all designed for Batch Layer. A role model is applied with the possibility of limitation to objects (tables), attributes / indicators (columns) and rows 4 Speed layer Speed layer Designed for streaming data processing and providing access to the most relevant data, i.e. data that has not yet been recounted by the Batch Layer, but has already appeared in the system. The Speed Layer looks only at recent data without access to history, while the Batch Layer looks at the entire data history. Not all indicators can be calculated on this layer 5 Query Queries from external BI systems Data transfer from Kafka to HDFS is implemented using Confluent open source solution – Kafka HDFS Sink Connector [9]. +IV. ARCHITECTURAL LAYERS DESCRIPTION AND DEFINITION +In our research Storage Layer and Serving Layer have their own Layers (sublayers), which are used for methodological correctness of data load. The data pipeline of the whole data movement is strict and should go through the following sublayers inside Serving and Storage Layers: +Data Storage Layer Serving Layer +As a Lambda-based driven architecture we have used the following architectural components in our research (fig. 2): +Fig. 3. The Workflow data pipeline and layers interconnection +The next table shows the definition and description of each used sublayer: +TABLE III. DESCRIPTION AND DEFINITION OF SELECTED SUBLAYERS +Detail Data Store DDS Postgre The layer of the current data slice presented in a relational form. Re-keying (generation of internal storage IDs). Conversion from object to relational storage. Normalizati on of data (if necessary). 
Creating a single data model (without unification) Storing a current data slice Data Mart DM Postgre Groups showcases by a specific attribute, most often the subject area. +Contains unified detailed data. +It contains calculated indicators for use in reporting. +Calculation of indicators used in several reports is necessarily submitted to this layer. Data unification. Denormaliza tion of data. Data Aggregation. Calculation of derived indicators used in several places. Report Layer REP Postgre The final reporting layer. From it, data are used only for display in BI tools. It is forbidden to build some reports on the basis of others. Only with the transfer of the information used in the DM layer. Calculation of indicators specific to specific reporting. +It can be both logical and physical. Calculation of derived indicators specific to a particular report. Export Layer EXP Postgre For each data consumer, a scheme is created in which objects are placed for load. The circuit performs almost the same functions as REP Name Abbr eviati on Location Definition and functions Transforma tions Staging Buffer Area STG/ BUF HDFS The area of temporary data accumulation in the format corresponding to the source without any transformations. +Streaming data comes from sources. No Staging Exchange Area STG/ EXC H HDFS The intermediate region for forming the next ETL processing packet. +All accumulated data are moved from the buffer to form a data processing packet. +It is assigned a unique BATCH_ID. BATCH_ID StagingA rchive Zone STG/ ARC H HDFS Storage of the complete archive of incoming messages without transformation of the storage format. +Incoming messages are archived after successful processing. Archiving and enlarging storage files. Operatio nal Data Store ODS/ HIST HDFS The area in which the source data scheme is stored, but they are reduced to a single binary form of storage. It contains the entire history of changes and deletions. Convert to binary storage format. 
Conversion from object to relational storage. Batch View ODS/ BW HDFS It contains only an actual slice of the state of objects without a change history and deleted records. Calculation of the actual data slice. Detail Data Store Staging DDS_ STG Postgre Batch layer. A separate instance is created for each source system. One-to-one data is transferred from HDP and stored only between downloads. Both full data load and only line changes (deltas) can come. Detail Data Store Logic DDS_ LGC Postgre Layer of transformation logic. Contains data transformation procedures before writing to DDS. V. RAILWAYS KPIS DATA DESCRIPTION +The conducted research has been performed using Key Performance Indicators (KPIs) data from one railway company. The data are represented by usual star schema which means that there is one fact table (main table with events – KPIs) and others are dictionaries. The data are corresponded to the 3-rd level of normal form. +The entities description and data types are the following (Table 4): +TABLE IV. RAILWAY KPI DATA DESCRIPTION AND IT TYPES + +Entity Attribute Data type Description DATA_T YPE ID INTEGER Dictionary – type of data for KPI. Can be approved or planned NAME CHAR DATE_T ID INTEGER Dictionary – type of date +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on June 20,2024 at 17:38:49 UTC from IEEE Xplore. Restrictions apply. 
2052 diff --git a/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt b/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt new file mode 100644 index 0000000..a41cb30 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/108 - Foundations of Data Quality Assurance for IoT-based Smart Applications 0.0.txt @@ -0,0 +1,178 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ + +See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/337256634 +Foundations of Data Quality Assurance for IoT-based Smart Applications +Conference Paper · November 2019 +DOI: 10.1109/LATINCOM48065.2019.8937930 +CITATIONS READS +11 332 +4 authors: + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Rodrigo Togneri +Escola de Administração de Empresas de São Paulo da Fundação Getulio Vargas 6 PUBLICATIONS 96 CITATIONS +SEE PROFILE +Juha-Pekka Soininen +VTT Technical Research Centre of Finland 108 PUBLICATIONS 3,160 CITATIONS +SEE PROFILE +Gláuber Camponogara University of São Paulo +12 PUBLICATIONS 182 CITATIONS +SEE PROFILE +Carlos Alberto Kamienski Universidade Federal do ABC (UFABC) +218 PUBLICATIONS 2,215 CITATIONS +SEE PROFILE + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +All content following this page was uploaded by Carlos Alberto Kamienski on 15 February 2020. +The user has requested enhancement of the downloaded file. 
+Foundations of Data Quality Assurance +for IoT-based Smart Applications +Rodrigo Togneri +, Glauber Camponogara http://swamp-project.org/ 5 Antifragility is a property of systems that increase in capability to thrive as a +, Juha-Pekka Soininen https://agrosmart.com.br/en/ result of stressors, shocks, volatility, noise, mistakes, faults, attacks, or failures +, Carlos Kamienski1 +rodrigo.togneri@ufabc.edu.br, glauber@agrosmart.com.br, juha-pekka.soininen@vtt.fi, cak@ufabc.edu.br 1Federal University of ABC, Santo André / Brazil +2Agrosmart, Campinas / Brazil +3VTT Technical Research Centre of Finland, Oulu / Finland + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract — Most current scientific and industrial efforts in IoT are geared towards building integrated platforms to finally realize its potential in commercial scale applications. The IoT and Big Data contemporary context brings a number of challenges, such as providing quality assurance (defined by availability and veracity) for sensor data. Traditional signal processing approaches are no longer sufficient, requiring combined approaches in both architectural and analytical layers. This paper proposes a discussion on the adequate foundations of a new general approach aimed at increasing robustness and antifragility of IoT-based smart applications. In addition, it shows results of preliminary experiments with real data in the context of precision irrigation using multivariate methods to identify relevant situations, such as sensor failures and the mismatch of contextual sensor information due to different spatial granularities capture. Our results provide initial indications of the adequacy of the proposed framework. +Index Terms— Data quality, internet of things, smart applications, precision irrigation. +I. 
INTRODUCTION +Nowadays, the Internet of Things (IoT) is increasingly leaving the state of an idea and landing its technology in its first practical projects worldwide. Proof of this evolution is the recent emergence of a series of research and commercial initiatives in the development of complete technological platforms that integrate IoT to the applications. Only in precision agriculture, IOF20201 and SWAMP2 [1], and Agrosmart3 and Agricolus https://www.agricolus.com/ [7]. + are important scientific and commercial initiatives, respectively. The technical and application challenges are enormous since these platforms enable complex real-time control systems that combine the use of communication infrastructure, hardware, software, analytical techniques and application knowledge combined into multiple layers. +Within the context of current challenges, this paper addresses the fundamental issue of input data quality. In any IoT-based smart application, the output is highly dependent on the data captured by field sensors. Dealing with the lack of data availability and veracity can be synthetized by the acronym GIGO (Garbage-In, Garbage-Out). In other words, however +sophisticated smart application models and algorithms are, poor quality input data will result in poor recommendations. +The solution to this challenge is to increase the smart application data sensing robustness and antifragility 5. The +straightforward benefit is that robust and antifragile sensing allows the system analytical core input data to be as good as possible. As a result, more reliable decisions are made, generating real value gains for applications and thus helping to maximize the end-user confidence in new technologies. +Within the strategic objective of realizing the benefits of this general solution, this paper brings two main contributions: +• The Foundations for a Data Quality Assurance Framework, as a new general vision to increase robustness and antifragility of sensing. 
Through the composition of complementary approaches, both traditional and cutting- edge ones, the proposed vision is of general use in IoT- based smart applications, although examples here represent the context of precision irrigation. +• Preliminary Findings with Real Precision Irrigation IoT Data that corroborate with the data quality assurance vision. Preliminary experiments were undertaken using raw sensor data provided by our partner Agrosmart, which raised some initial interesting insights in the automatic identification of data quality problems, diagnosis and treatment. For example, the use of multivariate methods has helped us to identify specific sensor failures and the mismatch of contextual sensor information due to different spatial granularities capture. These results corroborate to part of the proposed vision, particularly related to the anomaly multivariate techniques to process IoT data from multiple sources as a way to implicitly aggregate the application context. +In the remainder of this paper, Section II brings related work, Section III explains the foundations of the proposed data quality assurance vision, Section IV develops preliminary experiments with real data, Section V presents and discusses the key results of the preliminary experiments, and finally Section VI draws some conclusions. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +1 https://www.iof2020.eu/ + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +II. RELATED WORK +Karkouch et al. presented an overview of the main approaches to data quality in IoT, and the main contributions were the proposition of data quality dimensions and its categories, the systematic analysis of problems and the suggestion of techniques for the treatment thereof [2]. 
Our work complements it introducing the antifragility concept, valuing multivariate analytical techniques as links between data and its semantics in the application context, and considering also the influence of IoT architecture on data quality. +Banerjee and Shet realized the importance of addressing the data quality problem in architectural and analytical layers, although kept the discussion at a higher level [3]. Our work completes that discussion by introducing more practical elements towards IoT platforms. Dou and Nan worked specifically on the architectural question seeking to determine the optimization of sensor distribution layout and connectivity [4], although without fitting it into the broader context of data quality. +Liu et al. discussed data veracity problems and solutions, while this paper seeks to integrate data availability and veracity issues in a single approach [5]. +Sanyal and Zhang presented a compelling solution to the IoT data veracity issue through unsupervised estimation methods that replaced low statistical confidence data [6]. Our work complements it by providing a more sophisticated anomaly detection and classification approach that do not make use of estimation methods, providing a more reliable dataset (without disregarding anomalous but dependable data points – disregarded by estimation methods [7]). Vilenski et al. proposed to use multivariate techniques in detecting anomalies in agriculture [9]. Our work goes further proposing a more generalist approach, although our practical experiments are also in agriculture. +OGC http://www.opengeospatial.org + (Open Geospatial Consortium) developed open standards for IoT applications, providing two standards suitable for data quality solutions, namely UncertML (Uncertainty Markup Language) and QualityML (Quality Markup Language). This work is in accordance with these standards and intends to contribute with them when the vision proposed here is deployed as a functional framework. +III. 
FOUNDATIONS OF DATA QUALITY ASSURANCE +A. Data Quality Issues: Availability and Veracity +Data availability and veracity are key issues in IoT operations. The former is straightforward, i.e., if there are no stimuli coming from sensors, there is no reaction. And, the latter because if the sensor stimuli are relevantly inaccurate, the reactions may be inappropriate or even harmful. We want to maximize data availability, and within available data, we want to maximize their veracity. +Possible types of IoT data quality issues can be divided into availability and veracity problems. Data availability problems include: +• Error Data: Occurs when the sensors data capture system identifies a known problem, emitting a specific signal to it. The data is clearly invalid, and as it is easily identifiable, it must be converted into missing values. As a result, data becomes unavailable. +• Data Interruption: Occurs when a sensor data does not reach its reader. Regardless of the cause, data also becomes unavailable. +Data veracity problems include: +• Unbalanced Data: Occurs when sensor data is emitted and captured, but this data is not reliable to the measured phenomenon. Data is available but is not dependable. +• Non-Correspondence of Different Granularity Data: Occurs when there are valid sensor data, although there is a mismatch between different sources due to different space or time granularities of the sensing system. +B. Increasing Sensing Robustness and Antifragility +Data quality assurance can be achieved by acting on both architectural and analytical layers [3]. Fig. 1 shows the big picture of how these layers are placed in an IoT-based smart applications data flow. The Data Quality Assurance Framework is the phase coming right before Information Processing, which is the system core analytical task. + +Fig. 1: Data Quality Assurance Framework as a Data Transforming / Influencing Agent Through IoT-Based Automated Systems Data Flow. 
+Taleb [7] provided an important contribution to risk management by stating that robustness is not the opposite of fragility, introducing the concept of antifragility and making it easier for systems to be built to evolve with exposure to its environment. Since then, many engineering areas have been using advanced analytical techniques in the search for systems evolution [9] [10]. Taleb introduced a sensitivity scale of things to the environment instabilities (Fig. 2): at one extreme is the concept of fragility, in which things are harmed by instability; in an intermediate position is the concept of robustness, in which things are invariant to instability (do not harm or benefit); at the other extreme is the concept of antifragility, in which things benefit from instability and become better, i.e. things that increase in capability to thrive as a result of + +Fig. 2: Taleb Scale and Correspondence with Data Quality Assurance Effort Types (Architectural or Analytical). +stressors, shocks, volatility, noise, mistakes, faults, attacks, or failures [7]. +As environment instabilities usually bring new and unknown circumstances that cannot be managed by supervised machine learning [7], the antifragility vision states that these techniques should be underprivileged in relation to unsupervised and reinforcement machine learning, which are more adequate to really learn the unknown. Consequently, this is our first suggestion for an IoT data quality assurance framework. +Thus, between the two data quality assurance layers, although the architectural plays an important role, the one that has the greatest potential to flexibilize towards antifragility is the analytical, because it can evolve action rules over time by means of experiencing the data (machine learning). The more data and the more instabilities, the more the system learns and improves. 
+1) Analytical Layer Approaches +In the past, sensors were preferably subject of electric and electronic engineering, due to their use in equipment of highly specific and local applications. Data treatment was fully performed by signal processing techniques based on mathematical filters for eliminating noise, and keeping only the signal (relevant data) of individual sensors. +On the other hand, in the current IoT and Big Data era, data is becoming more complex and is directly linked to its meanings in smart applications: many dimensions, of different types, with nontrivial relationships among each other - nonlinearities, lag effects - and used in decisions in social environments or others of equal sensitivity. For example, in precision irrigation, a series of meteorological, soil moisture and crop growing stage data can be collected as input to water need estimation, and the relationship among these variables can be considered of high-complexity [11]. Asymmetries of soil moisture behavior also occur as their value, soil depths and the time varies. There is still a data type variety: while most data are series of quantitative variables, others of great relevance as georeferenced images are of semi or non-structured nature, mixing quantitative and qualitative values. +The complexity is not only in the nature of data but also from the data collection architecture, since sensors are sparsely spread on the space (they often have geo-referential characterization), have different periodicities and deal with fault tolerance concepts. +Thus, the traditional signal processing approach is no longer sufficient, requiring an evolution that here we call Signal Processing 2.0, which is an IoT adaptable data flow +based on multivariate unsupervised and reinforcement machine learning techniques. In this context, the analytical layer of our data quality assurance vision aims at bridging this gap. 
Further, the current scenario requires data treatment to be the target of the most powerful arsenal of machine learning techniques. + Fig. 3 synthetizes the data treatment flow in the analytical layer at a higher level. Also, flows differ depending on the type of data problems. The four steps of the analytical layer are: +Fig. 3: Macro-flow of Data Quality Assurance in the Analytical-layer. +a) Anomaly Detection +Data veracity problems cannot be easily identified because data belong to the expected domain range, and for this reason it is customary to use data mining techniques [12]. In this sense, the techniques of anomaly detection [13] [14] propose to identify out of context values and sometimes classify it. In the traditional signal processing realm, univariate applications (a single signal) are more common. However, in the more modern context of IoT and Big Data, multivariate techniques, the ones that consider the relationship among multiple data sources, gained a lot of attention due to their ability to identify anomalies inaccessible to univariate techniques. +b) Determining the Validity of Anomalous Values +A data point being anomalous does not mean that it is also invalid. It may simply be caused by the occurrence of a rare but real event, which obviously must be regarded as a valid point. At this step, therefore, one must seek for: i) automatic separation of valid from invalid anomalous points, through comparison with theoretical or empirical models [15], or using anomaly detection techniques; and ii) in case of an invalid point, if possible, define which variables are the cause of the anomalous effect, for discarding only data from the offending variable). This step is difficult to replicate for different applications, as it relies on domain specific knowledge (i.e. +theoretical or empirical models). +c) Assigning Missing Values to Invalid Values +Invalid values should not be used in analytical applications for preventing harmful results. 
This is the easiest step, and since the invalid values have already been identified, the only task here is to replace invalid values with missing values.
d) Data Reconstruction
The previous step gives us a more reliable dataset. In this step, missing values are reconstructed from valid ones using different techniques such as estimation methods [16]. When time series anomaly detection techniques [13] are adequate, or when there were incomplete original cases (which were therefore not considered in some anomaly detection approach), the reconstructed data come back to the anomaly detection step.
2) Architectural Layer Approaches
The architectural layer, encompassing elements as diverse as hardware / software development and data capture and communication solutions, naturally has a myriad of possible approaches. Here we emphasize higher-level architectural aspects that are key to sensing robustness and antifragility.
Fig. 4 synthesizes the influence map of the architectural layer in the system. It highlights the two main practical approaches: (a) use of a sensor grid [17] and, (b) use of image-based sensors (drones, satellites) [18]. Both allow a lower granularity of physical space, potentiating contextual spatial knowledge, also impacting the analytical layer by using spatial statistics techniques, with positive consequences in the system antifragility.

Fig. 4: Map of Influence of the Architectural Layer on the Analytical Layer of the Data Quality Assurance Framework.
The use of a sensor grid naturally brings an additional gain of robustness, because the sensors are physically distributed and a fault in one can be covered by an estimate from others nearby. Conversely, the gain in robustness is not natural in the use of image-based sensors, because sensors are concentrated in a single piece of equipment (drone or satellite), and, in case of a failure, all the space points are lost simultaneously.
This is known as the SPOF (Single Point of Failure) problem, which can be dealt with by using redundant equipment.
IV. PRELIMINARY EXPERIMENTS WITH REAL DATA
We performed preliminary experiments with real data from the precision irrigation domain, which provides evidence of the potential of using our vision for data quality assurance. Specifically, these experiments work within the scope of the anomaly detection step of the analytical layer and demonstrate the value of multivariate approaches.
A. Agrosmart and the Dataset
Agrosmart is a Brazilian company that provides crop intelligence services, using a proprietary IoT platform and application of advanced analytical techniques. It provided raw data for this study, from operations of five farms with soybean crops for a period of approximately 2 years, starting in the first half of 2016 (depending on the beginning of each culture cycle) until the end of August 2018. Each farm has 1 to 5 management zones, the internal spatial components of a farm, divided usually by soil characteristics.
This dataset contains sensor data, such as⁷: a) for the spatial granularity of the whole farm: air temperature [°C], soil temperature (at 40 cm deep) [°C], global solar radiation [W/m²], air relative humidity [%], wind speed [m/s], wind direction [°] and atmospheric precipitation (rainfall) [mm]; b) for the spatial granularity of the management zone (with a single sensor probe): soil water tension⁸ (at 20, 40 and 60 cm deep) [kPa], irrigation management [mm], and, in some cases, atmospheric precipitation [mm]. The temporal granularity of the raw data ranges between 5 and 30 minutes, depending on the variable and the farm or management zone. Further details are omitted due to confidentiality issues.
B. Approach
When considering the anomaly detection step, the most important aspect is whether multivariate approaches are useful to detect veracity problems.
In order to simplify the results, only two variables are considered: atmospheric precipitation (farm) and soil water tension at 20 cm deep9 (management zone), aggregated by day. From the raw variables, we derived new ones, due to their semantics in the agriculture context: +• Previous Soil Water Tension 20cm-deep : Soil water tension measured at 20 cm depth at the very beginning of the reference date (management zone). +• 1-Day-Delta (Soil Water Tension 20cm-deep) : Variation value of soil water tension 20cm-deep at the reference date. +• 1-Day-Precipitation : The total precipitation occurred at the reference date (farm). +We used LOF (Local Outlier Factor algorithm) [19] [14], one of the most successful anomaly detection techniques for modern Big Data environments. LOF is a multidimensional anomaly detection technique based on KNN10 for computing spatial density and providing a real numerical value (of domain 0, ∞ ) for each data point: the closer to 1, the more a certain point is similar to its neighbors, indicating that this point belongs to a cluster of points sharing a common behavior. On the other hand, the more distant from 1, the more unusual is the behavior of that point, which becomes an anomaly candidate. +For this experiment, data was cleaned from obviously invalid values (error data or domain outside values) And data was not reconstructed (i.e., data with missing values), as it is a simplified experiment. The presence of missing values makes that LOF is only applied in data points with non-missing values in all the considered variables. +7 All measurements are taken as recommended by [16]. +8 Pressure that the plant needs to exert to consume soil water. 0 kPa indicates extreme ease and 200 kPa represents a severe condition to plant. +9 At this depth the response to water intake is immediate. +10 In KNN (K Nearest Neighbor) algorithm, we used K = 15, arbitrated in response to the parameter stability criterion established in [14]. +V. 
RESULTS AND DISCUSSION
LOF generated approximately the same results for all management zones and farms, so that, without loss of generality, only the results of one management zone of one farm are presented. Fig. 5 depicts the scatter plot of the 3 derived variables. Filled circles denote a behavior considered common by LOF (considered cut-off value: 4), whereas points in other shapes represent anomalous behavior:
• Red triangle: The soil is previously dry (close to 200 kPa, the sensor ceiling value), with no relevant precipitation, although an extreme jump of water availability is observed in the soil, which is highly unexpected.
• Blue cross: Unusual soil drying jumps, when the expected behavior is a smoother drying process, even for days with no precipitation.
• Purple star: Extreme cases of the blue crosses, where soil water availability is high (values close to 0 kPa), but the

Fig. 5: Indication of Anomalous Points in the Data of One of the Management Zones and Farms - Scatter Plot Version.
soil dried completely (values close to 200 kPa) in only one day, a highly unexpected phenomenon.
Fig. 6 complements the analysis of Fig. 5 by showing the results in a timeline. We can see that red triangles are usually preceded by points with an opposite movement (purple stars and blue crosses), and between them we usually see points characterized by a yellow band, which are sequential points without any variation of values in the soil sensor (a time series anomaly behavior in itself). From domain knowledge, we know this pattern means soil sensor malfunction. However, we could infer that conclusion only by observing these rare events together (anomaly convergence). It is a clear example of how multivariate techniques and the convergence (in space or time) of multiple anomalies can identify real problems, and consequently differentiate them from rare but real phenomena. In other words, it is a way to use domain knowledge implicitly. 
+The blue crosses are harder to have their veracity determined only by Fig. 5, since their behavior is not as extreme as that of the purple stars and red triangles. However, Fig. 6 highlights that when they have similar patterns, almost glued to a yellow band, it suggests that they also indicate a failure. Once more, there is an anomaly convergence indicating a failure.
Another challenging case is the last red triangle point at the end of January 2017, because it is within the acceptable range of the three variables. However, it is in a marginalized condition according to the joint behavior, something that only a multivariate technique can capture. This happens when there was no precipitation but a significant increase in soil water tension was observed. Such abnormal behavior may have occurred either by a sensor data distortion (precipitation may have occurred without being captured in the data) or by non-correspondence of different granularity data (Section III-A). The latter is the most likely reason, since the soil data is from the management zone and the precipitation data is from the farm. Sensor problems are also less likely to have happened in this case because the sequential points are of common behavior (the red triangle in question is a single anomaly among common ones). Thus, this is an example where the non-correspondence of different granularity data can insert invalid data even though each sensor is emitting valid values.
Also in Fig. 6, most highlighted anomalous points occur in the off-season period (crop interval time), which makes sense, since the sensors may be under preventive maintenance or may not even be monitored because they are not in use anyway. However, other anomalous points (such as the last red triangle point) occurred during the crop period, when expressive anomalies are usually less frequent, making the detection more difficult. 
In all cases, the anomaly detection experiment revealed interesting results, identifying both expressive and subtle anomalies, in both off-season and season periods. Even in a simple experiment with few variables and a single technique, it provided a preliminary validation of our data quality assurance framework vision, showing that future work is welcome to improve it. + +Fig. 6: Indication of Anomalous Points in the Data of One of the Management Zones and Farms – Time Series Version. +VI. CONCLUSION +In response to the gap in the IoT literature in data quality, this paper proposes a new data quality assurance framework vision as a new approach to address the key practical challenges imposed by the new IoT platforms in the context of Big Data. +Real data of precision irrigation operations were used in preliminary experiments seeking to find some evidence of the adequacy of some of the key elements proposed in the framework. In this case it was the importance that unsupervised multivariate criteria, such as LOF, can play in the process, mainly helping to identify, validate and interpret anomalous values within the larger objective of guaranteeing data veracity. Most of the identified failures in the experiment were not identifiable by normal signal processing approaches, but only by the joint of multivariate criteria (anomalies were subtle, in multivariate context) and of the anomaly convergence phenomenon (in some cases, it even replaced specific domain knowledge need). We have observed that, in identifying valid and invalid anomalies, of expressive or more subtle detection, the experiments could be considered successful in encouraging new ones in a more complete version of the proposed vision, as a functional framework. 
+A straightforward next step is to deepen the experiments and analysis with real data, by comparing several techniques of anomaly detection, veracity criteria and data reconstruction as +well as the establishment of a feature engineering process for the capture of asymmetries and time effects among the variables. +REFERENCES +[1] C. Kamienski, J.-P. Soininen, M. Taumberger, R. Dantas, A. Toscano, T. Salmon Cinotti, R. F. Maia and A. Torre Neto, "Smart Water Management Platform: IoT-Based Precision Irrigation for Agriculture," Sensors 2019, vol. 19, p. 276, 2019. +[2] A. Karkouch, H. Mousannif, H. Al Moatassime and T. Noel, "Data Quality in Internet of Things: A State-of-the-Art Survey," Journal of Network and Computer Applications, vol. 73, pp. 57-81, September 2016. +[3] T. Banerjee and A. Shet, "IoT Quality Control for Data and Application Needs," IEEE Intelligent Systems, vol. 32, no. 2, April 2017. +[4] R. Dou and G. Nan, "Optimizing Sensor Network Coverage and Regional Connectivity in Industrial IoT Systems," IEEE Systems Journal, vol. 11, no. 3, September 2017. +[5] X. Liu, S. Tamminen, X. Su, P. Siirtola, J. Röning, J. Riekki, J. Kiljander and S. J.-P., "Enhancing Veracity of IoT Generated Big Data in Decision Making," IEEE International Conference on Pervasive Computing and Communications Workshops (PerCom Workshops), 2018. +[6] S. Sanyal and P. Zhang, "Improving Quality of Data: IoT Data Aggregation Using Device to Device Communications," IEEE Access, vol. 6, November 2018. +[7] N. N. Taleb, Antifragile: Things That Gain From Disorder, Random House Incorporated, 2012. +[8] E. Vilenski, P. Bak and J. D. Rosenblatt, "Multivariate Anomaly Detection for Ensuring Data Quality of Dendrometer Sensor Networks," Computers and Electronics in Agriculture, vol. 162, pp. 412 - 421, 2019. +[9] M. Lichtman, M. T. Vondal, T. C. Clancy and J. H. Reed, "Antifragile Communications," IEEE Systems Journal, vol. 12, no. 1, March 2018. +[10] M. 
Monperrus, Towards Antifragile Software: Knowledge-driven Perturbation of Software Systems with Active Learning, P Preux, 2016. +[11] R. Allen, L. Pereira, D. Raes and M. Smith, "Crop Evapotranspiration- Guidelines for Computing Crop Water," FAO Irrigation and Drainage Paper 56, FAO, 1998. +[12] V. Pendyala, Veracity of Big Data: Machine Learning and Other Approaches to Verifying Truthfulness, Apress Berkely, 2018. +[13] V. Chandola, A. Banerjee and V. Kumar, "Anomaly Detection: A Survey," ACM Computing Surveys, September 2009. +[14] L. Cao, C. Kuhlman and E. Rundesteiner, "Distributed Local Outlier Detection in Big Data," Conference Paper, August 2017. +[15] L. Berti-Équille and J. Borge-Holthoefer, Veracity of Data: From Truth Discovery Computation Algorithms to Models of Misinformation Dynamics, Morgan & Claypool Publishers, 2018. +[16] C. Crocetta, Theoretical and Applied Statistics, Treviso: Springer, 2015. +[17] A.-u. Rehman, A. Z. Abbasi, N. Islam and Z. A. Shaikh, "A Review of Wireless Sensors and Networks' Applications in Agriculture," Computer Standards & Interfaces, vol. 36, no. 2, pp. 263-270, February 2014. +[18] M. Kulbacki, J. Segen, W. Knieć, R. Klempous, K. Kluwak, J. Nikodem, +J. Kulbacka and A. Serester, "Survey of Drones for Agriculture Automation from Planting to Harvest," IEEE 22nd International Conference on Intelligent Engineering Systems (INES), 2018. +[19] M. M. Breunig, H.-P. Kriegel, R. T. Ng and J. Sander, "LOF: Identifying Density-Based Local Outliers," Proceedings of the 2000 ACM SIGMOD international conference on Management of Data, pp. 93-104, 2000. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +View publication stats +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
diff --git a/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt b/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt new file mode 100644 index 0000000..78390ab --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/12-Quality Model for Evaluating and Choosing a Stream Processing Framework Architecture.txt @@ -0,0 +1,202 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Noname manuscript No. +(will be inserted by the editor) +Quality model for evaluating and choosing a stream processing framework architecture +Youness Dendane Fabio Petrillo  Hamid Mcheick Souhail Ben Ali +2019 Jan +Abstract Today, we have to deal with many data (Big data) and we need to make decisions by choosing an architectural framework to analyze these data coming from dierent area. Due to this, it become problematic when we want to process these data, and even more, when it is continuous data. When you want to process some data, you have to rst receive it, store it, and then query it. This is what we call Batch Processing. It works well when you process big amount of data, but it nds its limits when you want to get fast (or real-time) processing results, such as nancial trades, sensors, user session activity, etc. The solution to this problem is stream processing. Stream processing approach consists of data arriving record by record and rather than storing it, the processing should be done directly. Therefore, direct results are needed with a latency that may vary in real-time. +In this paper, we propose an assessment quality model to evaluate and choose stream processing frameworks. We describe briey dierent architec- tural frameworks such as Kafka, Spark Streaming and Flink that address the stream processing. 
Using our quality model, we present a decision tree to sup- port engineers to choose a framework following the quality aspects. Finally, we evaluate our model doing a case study to Twitter and Netix streaming. +1 Introduction +More and more data is produced today, and dierent techniques have been developed in order to process this data. Due to modern Big Data applications, like sensors, stock-trading or even user web trac [6] data has to be processed +Universit du Qubec de Chicoutimi +Department of Mathematics and Computer science +555 boulevard de l'Universit +Chicoutimi, Canada +E-mail: dendaneys@gmail.com,fabio@petrillo.com,hamid mcheick@uqac.ca,souhail.ben- ali1@uqac.ca + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Title Suppressed Due to Excessive Length 13 +in real-time. The technique that can handle this problem is called : stream processing [5]. +So we have assisted to the rise of Stream processing frameworks, such as Samza and Flink, which are becoming more and more popular, for oering a model to ingest and process data at near real-time [7]. +However, with several stream processing frameworks and technologies associ- ated available, a problem arise : how to choose the right framework ? Each framework has its own features and is more or less dierent from another framework. +So, depending on the context, you choose the best solution. But another prob- lem occurs here : on what criteria are you basing on to answer this question ? In this paper, we provide a quality model for a decision taking. This model enforced by what we call variables/criteria, can help you through a decision and we see if it is suitable to choose stream processing framework. +We identify and explain in details four criteria that are important for the framework decision making. Further, we quickly present the selected frame- works with their pros and cons. The criteria and the frameworks have been chosen following a study of stream processing papers. 
We analyzed these pa- pers, and picked based on an average, the most redundant. +The rest of the paper is organized as follow, we analyze the related work that has been done (ii), and then answer to the previous questions by identifying what are the dierent criteria you have to base (iii) and by introducing the dif- ferent chosen stream processing frameworks (iv). We propose a decision model tree supported by the previous parts, that you can base on to choose the right framework technology (v). +2 State-of-the-art/ Related Work +A stream processing system requires four major elements: (1) Best under- standing of the streaming applications architecture (2) identication of key requirements of distributed stream processing frameworks (DSPF) that can be used to evaluate such a system, (3) survey existing streaming frameworks, (4) evaluation and a comparative study of the most popular streaming plat- forms. We divide the related work based on the three elements mentioned above. +2.1 Architecture of streaming applications +Streaming applications architecture is not too much dierent from web archi- tectures. Streaming sources are communicating using arbitrary protocols. So that, a gateway layer is set up to connect sources to streaming application and resolve the heterogeneity of sources protocols. A message queues are set up as a middleware to provide a temporary buer and a routing layer to match the accepted event sources and the applications [11]. +2.2 Requirements of distributed stream processing frameworks +There are eight rules [12] that serve to illustrate the necessary features required for any system that will be used for high-volume low-latency stream processing applications. 
– Rule 1: Keep the Data Moving by achieving a low latency
– Rule 2: Query using a high-level language like SQL on Streams (StreamSQL)
– Rule 3: Handle Stream Imperfections (Delayed, Missing and Out-of-Order Data)
– Rule 4: Generate Predictable Outcomes
– Rule 5: Integrate Stored and Streaming Data
– Rule 6: Guarantee Data Safety and Availability
– Rule 7: Partition and Scale Applications Automatically
– Rule 8: Process and Respond Instantaneously
2.3 Existing streaming frameworks
Several streaming frameworks have been proposed to allow real-time large-scale stream processing. This section sheds light on the most popular big data stream processing frameworks:
2.3.1 Apache Spark [15]
Developed at UC Berkeley in 2009 [19], Spark is a platform for distributed data processing, written in Java and Scala. In Spark, streaming computation is treated as a series of deterministic batch computations on small time intervals.
2.3.2 Apache Storm [18]
Storm is a real-time stream processor, written in Java and Clojure. Storm is a fault-tolerant framework that is suitable for real-time data analysis, machine learning, and sequential and iterative computation.
2.3.3 Apache Flink [17]
Flink is an open-source processing framework supporting both stream and batch. It provides several benefits such as fault tolerance and large-scale computation [14]. Many functionalities are offered by this platform, such as additional high-level functions like join, filter and aggregation; it allows iterative processing and real-time computation on stream data collected by different tools such as Flume [20] and Kafka [21].

Fig. 1 Frameworks comparative
2.3.4 Apache Samza [16]
Samza was created by LinkedIn to solve various kinds of stream processing requirements such as tracking data, service logging of data, and data ingestion pipelines for real-time services [14]. It uses Apache Kafka as a distributed broker for messaging, and Hadoop YARN for distributed resource allocation and scheduling [14]. 
2.4 A comparison between processing frameworks
The criteria for comparing the several frameworks listed above are data format, types of data sources, programming model, cluster manager, supported programming languages, latency and messaging capacities [14].
3 Paper Contribution
The work reported in this paper can be categorized under the class of decision support for choosing a stream processing framework. While there is a rich body of work in designing stream processing applications and broad comparisons between these applications, a system that can help you choose the best application by criteria is still missing from contemporary stream processing systems.
In this paper we discuss some architectural frameworks such as Storm, Spark and others that resolve the stream processing problem and we provide a quality model to choose and evaluate a stream processing framework based on some criteria such as latency, guarantees, fault tolerance and data processing model.
4 Survey of Stream Processing Frameworks
In this section, we will present 4 frameworks that are actually used to resolve the stream processing problem.
4.1 Storm
Storm integrates with any database (e.g., MongoDB) and any queuing system (e.g., RabbitMQ, Kafka).
Storm works with tuples. A tuple is a named list of values and can contain any type of object.
Its API is simple and easy to use due to only three abstractions:
1. Spout: A spout is a source of streams and reads from a queuing broker.
2. Bolt: Where most of the computation's logic goes. Computation logic can be functions, filters, streaming joins, streaming aggregations, etc. So basically, from an input, and with computation logic, you can produce new output streams.
3. Topology: A network of spouts and bolts.
Storm is scalable, fault-tolerant and has an at-least-once message guarantee semantic. The cons here are that there are no ordering guarantees and duplicates may occur. 
+Another of its strengths is if a node dies, the worker will be restarted on an- other node. If a worker dies, Storm will restart it automatically. +At the date of writing this article, with Storm SQL integration, queries can +be run over streaming data, but it is still experimental. +Furthermore, Storm provides an exactly-once guarantee with Trident which is a high-level abstraction. This model is a micro-batch processing model that add a state and will increase latency. +4.2 Spark +Spark is an hybrid framework which means it can perform batch as well as stream processing. +Spark natively works with batch, but it has a library called Spark Streaming +that can allow to work with near real time data. It means that incoming data +are regrouped into small batch and then processed without increasing the latency too much unlike Storm which provides true streaming processing. +One of its power is that the manner you write batch jobs is the same you write stream jobs. More than that, it is fault-tolerant and has an exactly- once semantics. +Spark has its own modules that you can combine : +{ Spark SQL +{ Spark Streaming +{ Machine Learning +{ GraphX (for graph programming) +Spark runs in Hadoop, Apache Mesos, Kubernetes, standalone or in the cloud and access diverse data sources such as HDFS, Cassandra, etc. +4.3 Samza +Samza is decoupled in three layers [8] : +1. Streaming +2. Execution +3. Processing +4.3.1 Streaming +For the message queuing system, Samza uses Kafka. Kafka is a distributed pub/sub and it has an at-least once message guarantees. Kafka consumers subscribe to topic, which allow them to read messages. +4.3.2 Execution +Samza uses YARN to run jobs. It allow to execute commands on a cluster of machines after allocating containers. This is made possible because of YARN, which is the Hadoop's next generation cluster scheduler. So, YARN provides a resource management and task execution framework to execute jobs. 
+4.3.3 Processing +It uses the two layers above; input and output come from Kafka brokers. YARN is used to run a Samza job and supervise the containers. The processing code the developer write runs in these containers. Samza's processing model is real time. +One of Samza's advantages is that the streaming and execution layers can be replaced with any other technologies. Also, because of the use of YARN, +Samza is fault tolerant; Samza works with YARN to transparently migrate tasks to another machine. +The processing model Samza provides are both batch and stream (real time). Whatever the code you write, it will be reusable whatever the model. Switching models needs cong change; from HDFS to Kafka to pass from batch to stream processing. +4.4 Flink +Flink supports batch and real-time stream processing model. It has an exactly- once guarantee for both models. Flink is fault-tolerant and can be deployed to numerous resource providers such as YARN, Apache Mesos and Kubernetes; but also as stand-alone cluster. +One of the advantages of this framework is that it can run millions of events per seconds by using the minimum of resources, all of this at a low latency. Flink provides three layered API's : +1. ProcessFunction : It implements the logic, process individuals or grouped events and give control over time and state. +2. DataStream : Provides primitives for stream operations such as transfor- mations. It is based on functions like aggregate, map and reduce. +3. SQL : To ease the writing jobs for analytics on real time data. +5 Criteria used in frameworks +To choose a stream processing framework, we have identied some criteria. These criteria don't give you the answer on whether you should use stream processing or batch processing, but rather helps you take the decision to pick the right framework. So this step assumes that you already identied the problem and you came to the idea that should use stream processing model over batch processing. 
We are first going to give the criteria and explain them in detail:
– Latency
– Message semantics (guarantees)
– Fault tolerance
– Data processing model (micro-batch or real-time)
5.1 Message semantics
Another term referring to this criterion is message guarantees. The message guarantees can take three forms:
– At-least-once: there could be duplicates of the same message but we are sure that it has been delivered
– At-most-once: the message is delivered zero or one time
– Exactly-once: the message is guaranteed to be delivered exactly one and only one time
Before providing message guarantees, the system should be able to recover from faults. [6]
5.2 Fault tolerance
Streaming applications run for an indefinite period, which increases the chance of having faults. So this criterion is important, because the system must keep operating despite the application having faults.
Fault tolerance guarantees that the system will be highly available, operates even after failures and has the possibility to recover from them transparently. Flink has the highest availability.
5.3 Latency
Latency is the time between the arrival of new data and its processing [10]. Latency goes hand in hand with recovery (fault tolerance) because, whenever the system has errors, it should recover fast enough so that the latency doesn't increase too much (i.e., the processing continues with minimal effect). Also, each framework can do some optimization on data, such as message batching, to improve the throughput, but the cost is sacrificing latency.
5.4 Data processing model
To do stream processing, there are two techniques:
– Micro-batch: based on batch processing, but rather than processing data that have been collected over previous time, data is packaged into small batches collected at very small time intervals and then delivered directly to the batch processing. Spark, for example, does micro-batch.
– Real-time: data is processed on the fly as individual pieces, so there is no waiting. Flink processes data in real-time. 
+As messages are received directly the real-time processing technique has a lower stream processing latency than micro-batch but it become harder to have an exactly-once semantics. However, micro-batch provides better fault- tolerance and thus it can guarantees that the message has been received only once (i.e : Spark Streaming). +What we understand here is that message semantics are related to the fault tolerance and the data processing model, and according to how the fault tolerance is implemented the latency will increase or decrease. + +Fig. 2 Frameworks per paper + +Fig. 3 Criteria per paper +6 Quality Model for choosing and evaluating a SPF +After presenting the dierent frameworks and found the main characteris- tics/criteria, we came with a model. A model for evaluating the frameworks and choosing one given a set of criteria. In this section, we explain why we have chosen these particular frameworks and how we extracted certain crite- ria. Afterward, we explain how we have prioritized the criteria, and then, with all these information we present the quality model. +6.1 Methodology +There is several processing frameworks used in production today. But to nd +out what framework is used in which company is dicult and take time. So, our primary support was the research papers. We analyzed various papers about stream processing, and we dened redundancy as our benchmark. This means that we made a table with the papers and frameworks, and every time a paper cited a framework we gave a point to the paper. At the end, we had a table with the frameworks cited per paper. +We repeated the same process for the criteria. The result is on gure 3. +This paper is a rst draft, and we plan to study more papers to have more criteria and frameworks, and thus, to have better average results. +6.2 Choosing and prioritizing the criteria +After nding the criteria, we had to prioritize them. Here is the criteria ranked by importance. +1. Data model +2. Fault tolerance +3. 
Message semantics +4. Latency +The rst decision is what type of stream processing to choose, because this will have an impact on the other criteria. If you choose a micro-batch framework, it will be possible to have for each framework an exactly-once message semantics as opposite to a real-time model. +Latency is of great importance, but, a framework should be able to recover fast enough, so it does not aect the system too much (with minimum time). And before providing message semantics it also should be recover from faults automatically. Because it will inuence the other criteria beneath it, this is why the fault tolerance is in second position. +Depending on whether it is exactly-once or at least-once message semantics, the latency will change depending this criteria. +6.3 Decision Model Tree +Based on the previous parts, we present the decision model tree to evaluate and choose a stream processing framework (g. 4). +7 Case studies +In this section, we analyze some stream processing application cases. We go through two companies : Netix and Twitter. +The goal of this section is to see if our contribution in this paper correspond to the reality (i.e: real world application). In analyzing how and why these companies use stream processing frameworks, we can identify the main under- lying elements and compare them to our criteria. We get all information from papers and the companies tech blog. +7.1 Twitter +Twitter has actually an in-house framework called Heron. But before that, they were using Storm. We are going to detail framework evaluation for Storm, because Heron is an improvement but they are still using what we detail below. +The company that has made Storm was acquired by Twitter in 2011. Since, Twitter modied for their use. + +Fig. 4 The decision model tree +Let's begin with our rst criteria : data processing model. At Twitter, due to choosing Storm, as we described it above, it has a micro-batch processing model. 
So, just by using it, the choice of data processing model has been made. We go now to our second criteria : fault tolerance. When Twitter describes Storm [18], they say that one of the argument chosen to design Storm is : resilient (i.e : fault tolerant); their second criteria and ours correspond. As they say in the article [18], on of the feature key is the processing semantics or message semantics. They describe that their solution has two guarantees : at least once and at most once. This characteristic correspond to our third criteria we have mentioned. Further in the article, Ankit et al. report some experiment they have made that had to show the latency results. As they calculated, their latency is close to 1ms 99% of the time. Our criteria are justied by the design and the use of Storm at Twitter. +In this rst subsection, we can conclude that our criteria are match with the main characteristics of design and use of Storm at Twitter. +7.2 Netix +In their article [22], they describe Keystone which is their stream processing platform. The solution chosen to do stream processing is Apache Flink. By choosing Flink, they automatically chosen the real-time processing for the data model criteria. Then, they gave a summary of common asks and trade-os and one of them is failure recovery. This correspond with our criteria. One of the +asks was that the system is fault tolerant. If we follow our model, the next step is to choose the message semantics. In the post, their say that according to the use case loosing some events in the pipeline is acceptable while in other cases the event have to absolutely processed so it require a better durability. We see that this sentence is a synonym to our message guarantees criteria. In another post [23], they describe this time a real use case : to know what is trending on Netix. In order to that, they need real-time data of what users watch, the event is then send to be processed. 
They describe that one of their challenges was having a low latency. This last criteria match with ours. +What we can conclude in this section is that these companies followed a path which correspond with our quality model. All our criteria had been taken into account by these companies and are part of the core decision on choosing and using stream processing framework architecture. +8 Discussion +In this section we will discuss the impact of our results, impact as well on engineers as on researchers. This quality model can be used as a guideline when wanting to choose a stream processing framework. Answering what type of criteria is important for a given context will end to the choice of the right solution; do I need absolutely only one instance of data or is it permissible to have duplicates ? (i.e: at least once vs exactly once semantics). Answering to these questions based on the criteria we identied will help the engineers make the right choice quicker. Further, the use case of our model is not lim- ited to the choice only. Our model can be extended to serve to design a future stream processing framework architecture. When designing the solution, the model can help to see further steps on what will be implemented and thus the dierent dependencies it will have : when implementing the fault tolerance, the latency will increase or decrease given on how it is implemented. More over, thanks to the model, we see that the fault tolerance will also inuence the message semantics. So based on what we want to have as message guaran- tees, we will implement the fault tolerance in a dierent manner. In the other hand, researchers can use this model when wanting to evaluate a framework architecture. Also, this model, can be reused in order to compare dierent frameworks. When wanted, as part of their research, they can have a quicker and a better view on the dierent solution and what brings to them and how they are dierent and also similar. 
Moreover, when wanted and depending on their need, they can easily extend this quality model in order to adapt it to their work : adding a criteria will add complexity, and thus a possible different path. +9 Conclusion & Future work +With the huge amount of data generated, and given a stream processing context, choosing the right framework architecture is major. In order to do that, +we first identified and explained what are the different criteria such as data model and latency... and presented some stream processing frameworks. We explained our methodology on how we came to choose the ideal framework architecture to fulfill user's needs. Given these, we provided a decision model tree which is a quality model to choose and evaluate a stream processing framework. +There is more work that has to be done, in order to have more criteria and frameworks, thus to have a more complete and complex model. We can base on this model to evaluate and choose a framework architecture, and not only that, this model can also serve as a guide to designing a new stream processing framework architecture. It can also be used as a support to have quickly a global view of the different solutions and what they bring to them depending on the different criteria. +References +1. http://storm.apache.org +2. http://spark.apache.org +3. A Framework for Real-time Streaming Analytics using Machine Learning Approach, Proceedings of National Conference on Communication and Informatics-2016 +4. http://kafka.apache.org +5. Michael Stonebraker, Uğur Çetintemel, Stan Zdonik. The 8 requirements of real-time stream processing. ACM SIGMOD Record Homepage archive, Volume 34 Issue 4, December 2005, Pages 42-47. +6. Supun Kamburugamuve and Geoffrey Fox : Survey of Distributed Stream Processing. +7. Fangjin Yang, Gian Merlino, Nelson Ray, Xavier Léauté, Himanshu Gupta, Eric Tschetter +: The RADStack: Open Source Lambda Architecture for Interactive Analytics. +8. http://samza.apache.org +9. http://flink.apache.org +10. 
Andre Luckow, George Chantzialexiou, Shantenu Jha. Pilot-Streaming: A Stream Processing Framework for High-Performance Computing +11. Supun Kamburugamuve, Geoffrey Fox : Survey of Distributed Stream Processing +12. Michael Stonebraker, Uğur Çetintemel, Stan Zdonik: The 8 Requirements of Real-Time Stream Processing +13. Karan Patel, Yash Sakaria, Chetashri Bhadane : REAL TIME DATA PROCESSING FRAMEWORKS +14. Wissem Inoubli, Sabeur Aridhi, Haithem Mezni, Mondher Maddouri, Engelbert Nguifo +: A Comparative Study on Streaming Frameworks for Big Data +15. Apache Spark. Apache spark: Lightning-fast cluster computing, 2015 +16. Apache Samza. LinkedIn's real-time stream processing framework by riccomini 2014 +17. Apache Flink. Scalable batch and stream data processing, 2016 +18. Ankit Toshniwal, Siddarth Taneja, Amit Shukla, Karthik Ramasamy, Jignesh M Patel, Sanjeev Kulkarni, Jason Jackson, Krishna Gade, Maosong Fu, Jake Donham, et al : Storm @Twitter. In proceedings of the 2014 ACM SIGMOD International Conference on Management of Data, Pages 147-156 +19. Matei Zaharia, Mosharaf Chowdhury, Michael J Franklin, Scott Shenker, and Ion Stoica. Spark: Cluster computing with working sets. HotCloud, 10(10-10):95, 2010 +20. Craig Chambers, Ashish Raniwala, Frances Perry, Stephen Adams, Robert R Henry, Robert Bradshaw, and Nathan Weizenbaum. FlumeJava: easy, efficient data-parallel pipelines. In ACM Sigplan Notices, volume 45, pages 363-375. ACM, 2010 +21. Nishant Garg. Apache Kafka. Packt Publishing Ltd, 2013 +22. https://medium.com/netflix-techblog/keystone-real-time-stream-processing-platform-a3ee651812a +23. https://medium.com/netflix-techblog/whats-trending-on-netflix-f00b4b037f61 +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
diff --git a/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt b/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt new file mode 100644 index 0000000..ec0514f --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/14-Big Data Oriented Light-Load Embedded Performance Modeling.txt @@ -0,0 +1,115 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2020 IEEE 5th International Conference on Cloud Computing and Big Data Analytics +Big Data Oriented Light-Load Embedded Performance Modeling +Jinfeng Dou Jiabao Cao +College of Information Science & Engineering Department of Research and Development Ocean University of China Qingdao 266100, China Nokia Corporation +e-mail: jinfengdou@ouc.edu.cn Qingdao 266100, China +e-mail: william.cao@nokia-sbell.com +Xin Li, Lijuan Wang, Shuya Tang +College of Information Science & Engineering +Ocean University of China +Qingdao 266100, China +e-mail: 450751328@qq.com, 296189725@qq.com, tangshuya1995@163.com + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract—With increasing development of big data, the performance assessment and optimization face with a big challenge. The traditional methods widely use delivery-testing- analysis-solving (DTAS) ring. In big data area, big data environment is necessary for the testing phase in DTAS, which results in the big cost in both time and hardware. This paper proposes the big data oriented light-load embedded performance modeling. It ascertains the performance criteria to set the Capacity and Performance (C&P) factors. These factors will be embedded into the software with an on-off switch during the architecture, design and developing phases before DTAS phase. 
After the software coding done with embedded C&P factors, a small traffic load is run to collect the C&P data. The collected data will be used for the performance bottleneck finding, performance optimization, and forecasting the capacity and performance for various customers’ scenarios. Since the data easily help locate the issue, the required running traffic is small, and the problem solving is done before the traditional DTAS, this study is more suitable for the big data application. It can save more than 50% of time, decrease the software development efforts, and reduce the lab resources occupation. Finally, the proposed method is employed in the real prototype of an Internet of Things application, obtains the better capacity and performance, and the experiment data verify its effectiveness. +Keywords-Big data; capacity and performance; light-load; performance modeling; performance optimization +I. INTRODUCTION +With more and more fields applying Big Data and Internet of Things (IOT), the performance assessment and optimization of the software system face with a big challenge [1]. The capacity and performance (C&P) is the base and specific to the software system [2]. Take an example, the closure of issues in GitHub projects and the model of issue closure rates proposed cares about an improved understanding and prediction of the important measure of the development process performance [3]. An abundance of data in many disciplines of science, engineering, national security, +health care, and business has led to the emerging field of big data analytics (BDA) that run in a cloud computing environment [4]. +Applying traditional performance assessment and optimization, delivery-testing-analysis-solving (DTAS) ring, into the big data application has some problems, such as low efficiency, big testing and debugging effort and complex expensive environment. 
In the traditional ways, the performance engineering almost depends on the performance tester’s testing and lots of debugging again and again [5]. To process the emerging field of BDA that run in a cloud computing environment, the developers leverage Data- Intensive Scalable Computing (DISC) systems such as Google’s MapReduce, Hadoop, and Spark. While the developers have no easy means to debug DISC applications [6]. It still need lots of testing and debugging day and night with massive test cases for the coverage of big data. +Various call models are usually used when deploying a software in the customer site. It is composed of some kinds of scenarios with corresponding weights. In some C&P work [7-8], to identify the C&P of one call model, the testing work need be done again and again to find its top capacity and throughput. Moreover, various customers may have various call models. Then the testing work will take lots of lab sessions which mean a lot of human resources, a lot of lab equipment, a lot of power consumption, a lot of lab space occupation, etc. +To reduce the testing and debugging cost in time and environment for C&P monitor and optimization, some performance testing tools are introduced, e.g., Insure++ for the software by C/C++; Jcontract and Jprofiler for the software by Java; XHProf for the software by php. These kinds of C&P tools can help with debugging. However, it still needs repeated testing and complex expensive environment. +This study proposes the performance modeling based lightweight embedded C&P method (LECPM). The LECPM embeds C&P factors for the C&P monitor and statistics in the software interior. With a lower load running, e.g. 10% of + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +978-1-7281-6024-5/20/$31.00 ©2020 IEEE 476 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+ +required traffic, the performance engineer can get the C&P statistics and analysis for the software, find and resolve the bottlenecks and related problems before delivering to integration testing. Since the used load is small, a lot of lab resources can be saved, and repeated testing can be reduced as a lot of lab sessions will be saved. Much earlier the bugs are found, much less the development and maintenance efforts will be. +II. RELATED WORK +A performance testing method for embedded software platforms was described, which analyzed the performance constraints of the platform to improve software quality and performance into account during early development stages, test system reliability [9]. The model allowed to take as well as to perform regression testing. The study modeled a system process based on load testing and profiling data to produce representative workloads, create profiler snapshots, and get performance hotspot reports [10]. The performance issues are identified and matched with the specification of antipatterns. A formalism, stochastic performance logic, represented performance requirements, which can identify performance differences in realistic unit test scenarios [11]. An automated approach, PerfLearner [12], extracted execution commands and input parameters from descriptions of performance bug reports, and used them to generate test frames for guiding actual performance test case generation. The study used a declarative domain specific language (DSL) drive the end-to-end process of executing performance tests [13]. A model-driven framework can specify the performance intentions by relying on a powerful target-oriented language. A systematic literature review identified 208 fault prediction studies published from January 2000 to December 2010 [14]. The methodology used to build models seems to be influential to predictive performance. 
A software model can be analyzed for nonfunctional requirements by extending it with suitable annotations and transforming it into analysis models for the corresponding nonfunctional properties [15]. Communication Sequential Processes (CSP) and the model checker Process Analysis ToolKit (PAT) [16] modeled and verified the OpenFlow scheduled bundle mechanism in software defined networking (SDN), which guaranteed the completeness and consistency of messages transmitted between SDN switches and controllers during the communication process. +Some study gives the method to resolve part of the performance issues. Most study almost depends on the performance tester’s testing and lots of debugging again and again, and most performance is mainly about fault finding. The testing work will take lots of lab sessions. Various customers may have various call models, so many similar call models need repeated testing, and these testing will take huge of these resources. This paper introduces the performance modeling that helps engineer find C&P related problems before delivering to integration testing, and reduce the development and maintenance efforts. +III. LIGHT-LOAD EMBEDDED PERFORMANCE MODELING AND CASE STUDY +We propose LECPM to use low traffic to get the C&P factors composing of the performance engineering base, C&P data. The C&P factors may include the external resources and internal resources, such as CPU, shared memory, message queue, global objects, etc. With these base C&P data, we can compose any call model and give the estimation for each call model for the validation, hence much testing work will be reduced. The C&P data will also clearly show the critical point of the capacity and performance, so the related problems can be much easier found, analyzed and resolved. Moreover, the work in LECPM is done before DTAS, much earlier the bugs are found, much less the development and maintenance efforts will be. 
+The performance engineering designates and validates the C&P data, provides the resolutions to optimize the system C&P, and implement the call model engineering with forecasting the system C&P. The LECPM can use the base C&P data but not the personal experience as the chief gauge, which is a much more scientific way. This engineering requires the performance engineer to involve the software development from the beginning of the system requirements analysis. The performance engineer need work with the system engineer to analyze the requirements, work with the architect to be familiar with the software architecture and to give the performance related comments to the architect, need start to write code in the early phase of software framework design and coding, and will start the performance initial analysis after the software framework done and before the functionality implementation. The detail work flow is shown in Fig. 1. It covers embedding C&P factors, C&P statistics and optimization, and C&P forecast. In this section, we will demonstrate how performance modeling is, how is it done, and finally we use the experiment data to verify it. +Figure 1. The performance modeling work flow +A. Performance Modeling Base-AASI +The base of performance modeling is the abundant C&P data. The C&P data is conditionally embedded into the software. The embedding work has 4 steps named AASI in Fig. 2. They are: Ascertain specific C&P factors, Analyze the software architecture and split it module by module and + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +477 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply. + +interface by interface, Specify the C&P data, and Implement the embedding of the C&P factors and the statistics of the C&P data in the software. The prior 3 steps are called AAS. +Figure 2. AASI model + +Figure 3. 
The CPU variation with different traffic +The C&P factors include the exterior resources and interior resources. The exterior resources are common to all kinds of software; they may be CPU usage, shared memory, network bandwidth occupation, the disk usage, the DB resources, etc. The interior resources are specific to the certain software, may be message queue, some certain global objects, count of threads, etc. The C&P factors may be some of them which depend on the software’s usage scenario and architecture characteristics. +Here we need study the specific software architecture. Any software can be modularized, and the modules communicates with each other using the public or private interfaces, and some modules may also communicate with external resources or third party applications using public interfaces. These interfaces may be some global objects, some message protocols, the files, the shared memories, DB objects, etc. +In addition to the C&P factors ascertainment, modularization and interfaces identification, the software application scenarios need to be identified. What we should do is to identify each single scenario. All of them will be used to specify the C&P data. Actually any above C&P factors can be used for the C&P data. The C&P data could be like the CPU time used in one module and/or in one message, it can be counted with average value in a certain time, or be counted with the total value in a certain time. The experiment shows that the average value in a certain time is much more useful and much easier to be compared and to be analyzed. The network bandwidth can also be as the C&P +data. We can count the messages size in a certain time when they are transferred between the modules or between the module and external network element. They can be shown finally as the network bandwidth statistics. 
If the message queue is used in the software to have the modules interior communication, the message queue status need be taken as the C&P factor; it can be the size of queue, or be the hold time for the queue. Take one more example, in some software, some global object is used to be the critical shared resources among some modules, then it must be used for the C&P data. The performance engineer may care about its total size any time, or about its variation trend. The final step, the embedding implementation, is to apply the above analysis and design into the deployed software. Definitely it should be a feature of this product, and it also has the common software development cycle. It should be enabled or disabled easily, and it will only be used in the development lab. It will not take effect in the site, and will not and should not have any impact to the software when deployed in site. For the implementation, it is suggested that in the early development phase, i.e., once the software architecture is designed, these C&P data should be embedded into so that it can validate that the software adopts and implement a healthy architecture. +B. C&P Monitoring and Optimization +The software C&P is measured with the data of traffic throughput under the certain CPU level. We often set the CPU level as 45% or so for the max normal load in most healthy software especially related to the human behaviors, and before the CPU usage reaches at 40~50%, the CPU usage variation is linear with the traffic, as is verified in the experiment, shown in Fig. 3. The probability of the certain traffic load occurrence is following the Poisson distribution [17]. In probability theory and statistics, the Poisson distribution is a discrete probability distribution that expresses the probability of a number of events occurring in a fixed period of time if these events occur with a known average rate and independently of the time since the last event. 
For example, suppose there is a telecommunications application, this application is serving people the communications. In the dimension of time, the communications traffic sometime is busy, and sometime is idle, we can say that the traffic occurrence follows the Poisson distribution. What we want to ensure is that the system works with a good criterion (e.g. 99.999% successful rate) when the traffic load is not greater than the most possible traffic load (with the biggest possibility) per the Poisson distribution theory; and may allow more errors when the traffic load is much greater than this value and reaches at its top, which is defined by the product manager or by the customer. For a healthy and economic software, the CPU usage under the above stated traffic load is 40~50% so that it can be tolerant of peak traffic load with enough CPU space. +With above analysis, we will monitor that how many traffic throughput is supported by the aimed software under 45% CPU usage. And how big is its supported capacity. Here we will get the CPU time, global objects status, and corresponding memory occupation for each typical single scenario, which are the C&P data base. These kinds of data + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +478 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply. + +are what we should monitor. In the performance modeling, we can first use 2 or 3 little call load to get the base, and then with these data and the linear variation below 45% CPU usage to evaluate the rough call load under 45% CPU, finally validate it. So the overall testing effort will be much reduced. +It is recommended to implement the performance modeling in the software early development as shown in above Fig. 1. Thus in the early development phase, the system performance related problems will be found early. How are they found? 
In above sessions, this paper stated that the CPU time will be counted in each module, all the message queue status will be monitored, and the global objects variation trend will also be tracked. After analyzing these C&P data, we will compare the CPU time and analyze its reasonability by each module. If the module A takes about 2% CPU time, however, the similar module B takes about 20% CPU time, then we can say that there is something wrong in module B. Moreover, if each message handling takes about 1 second in module C, we can say that module C is abnormal since the message handling should only consume the millisecond level. With the tracked global objects variation trend, if it is not flat but increasing, we can judge that there is some memory leak for these global objects. For the message queue, when using a higher call load, the message queue size increases for module B, we can say that module B has little ability to handle its messages; its ability need be improved by either multiple threads or by enhancing processing capacity of the single thread. We can see that this kind of optimization takes less effort than the traditional methods, and can be verified easily. With this method, the capacity issue can be easily found, and the developers can also check if the new code involves capacity issues using the less-effort performance modeling testing. +In one real case, shown in Fig. 4, we developed a typical web server with database in an IOT application, which serves the end user for the http request including data query and input, and for the http notification of the received IOT data. The performance modeling method is used in this product to find the capacity issues so as to resolve them. This software uses the average processing time and the average awaiting time as the C&P data. As shown in Fig. 
5, we can see that the average awaiting time in the module DataProcessingModule is abnormal, and the average processing time in the modules DataProcessingModule and DBWriteModule are abnormal. The average awaiting time value of other modules is 100 or so, however, the DataProcessingModule is greater than 1000. Most of the average processing time is about 300 or so, and DataProcessingModule and DBWriteModule are greater than +1000. With the software architecture analysis, the abnormal data in DBWriteModule is caused by the database update operation which is reasonable and acceptable. What we should resolve is DataProcessingModule. The awaiting time means that the messages put into this module can’t be handled immediately. The awaiting time is close to the processing time in DataProcessingModule, after analyzing the software architecture, we find that this module is a single thread, the later coming messages must be wait until the +previous messages completes. So we change this module to be multiple threads to resolve this issue. For the big average processing time in this module, we note that the logic in DataProcessingModule is the memory operation but not disk operation, so the big processing time is unreasonable. After comparing with the initial C& P data without functionality applied, we found that the pure software framework is excellent in this module. With the quick temporary C&P factor added and test, it is found that one system call related to the time is called, which consumes a big CPU time. The final enhance work and the testing results on these enhancement shows that the system is healthy with good C&P data. +C. Call Model Engineering Based on C&P Forecast +The call model definition or requirements mainly comes from the customer sites or from the product manager. 
When the software is deployed in the customer sites, various customers will have various kinds of call models, and even the same customer will have different call models in the different period. The performance engineering based performance modeling provides an easy way for the call model engineering, which avoids doing much test and saves much effort. This call model engineering is to forecast the C&P based on the C&P data of each single scenario together with the software architecture decomposition data, such as the module hit of each single scenario. +Figure 4. The Web Server software modules and interfaces + +Figure 5. The initial C&P data + +Figure 6. The C&P forecast and real test result comparing + +Figure 7. The module hit of each single scenario +Let’s continue to use the web server with database in an IOT application as the example. One customer needs the call scenario with 200 tps (transaction per second) of query + 500 tps of IOT data report, and wants to know the hardware requirement. As shown in Fig. 6, we have had the C&P data of each single scenario, query only and IOT data report only. +With the software architecture decomposition, each single scenario has the module hit data show in Fig. 7. Fig. 7 indicates how many times each module is called per scenario. We estimate the draft CPU usage according to the subtotal of the time of each module as shown in Fig. 6 and the given tps in each single scenario. The estimation method is: +First get the estimated subtotal in certain module: The estimated subtotal in certain module = * + * . By the way, we can also get the draft average time using the equation: average time = /. +Then the estimated CPU usage can be calculated using method: ((CPU usage by query only + CPU usage by IOT data report only)/2) * (((< total time of query> + < total time of IOT data report)/2)/< total time of the estimated subtotal>. 
+Finally what we estimated by this engineering method is that 100 tps of query + 500 tps of IOT data report need 63% CPU. The official supported top CPU is 45%, so we need deploy 2 instances of the server platform to support the customer. The experiment validated that this engineering method is close to the real testing result. +IV. CONCLUSIONS +Generally, the performance modeling proposed a better method of the performance engineering. With this method, the C&P factors were embedded into the software architecture, which helped the performance engineer easily nail down the capacity issue with little temporary debugging +code since the C&P data gives detail, helped the performance engineer quickly get the C&P data for the specific call models, and could help the developer quickly find if the new change on the software has capacity issue. These explicit is suitable for the big data background. It benefits save a lot of development effort and raise the product competitiveness. The future research will be on how to implement a common implant and how to study the general estimation tool. +ACKNOWLEDGMENT +This work was financially supported by the Shandong Natural Science Foundation (ZR201702170341) and Postgraduate Education Quality Improvement Program (HDYJ18008). +REFERENCES +[1] Q. Liu, Y. J. Fu, G. Q. Ni, J. M. Mei, “Big Data Management Performance Evaluation in Hadoop Ecosystem”, 2017 3rd International Conference on Big Data Computing and Communications (BIGCOM), Chengdu, China, pp.413-421, 10-11 Aug. 2017. +[2] B. Boehm, “Improving and Balancing Software Qualities”, 2016 IEEE/ACM 38th IEEE International Conference on Software Engineering Companion, Austin, TX, USA, pp. 890-891, 14-22 May 2016. +[3] J. Oskar, J. Szymon, W. Adam, P. Kamil, J. Michal, “Surgical teams on GitHub: Modeling performance of GitHub project development processes”, Information and Software Technology, vol. 100, Aug 2018, pp. 32-46. +[4] F. Xu, H. Zheng, H. Jiang, W. Shao, H. Liu, Z. 
Zhou, “Cost-effective cloud server provisioning for predictable performance of big data analytics”, IEEE Transactions on Parallel and Distributed Systems, vol. 30, n. 5, pp. 1036-1051, May 1, 2019. +[5] J. Y. Wang, “An imperfect software debugging model considering irregular fluctuation of fault introduction rate”, Quality Engineering, v 29, n. 3, July 2017, pp. 377-394. +[6] M. A. Gulzar, “Interactive and Automated Debugging for Big Data Analytics”, 2018 IEEE/ACM 40th International Conference on Software Engineering: Companion, Gothenburg, Sweden, pp. 509- 511, May 27 - June 03, 2018. +[7] O. Jarczyk, S. Jaroszewicz, A. Wierzbicki, K. Pawlak, M. J. Lorek, “A software quality framework for large-scale mission-critical systems engineering”, Information and Software Technology, vol. 102, October 2018>*pp. 100-116. +[8] R. Riccardo, Z. Lamberto, F. Alberto, A. Ilan, “Big data analytics capabilities and performance: Evidence from a moderated multimediation model”, Technological Forecasting and Social Change, vol. 149, December 2019. +[9] A. Shen, M. Kuzlu, M. Pipattanasomporn, S. Rahman, L. Chen, “ A performance testing method for embedded software platforms”, 2016 IEEE International Conference on Cyber Technology in Automation, Control, and Intelligent Systems (CYBER), Chengdu, China, pp.135- 140, 19-22 June. 2016. +[10] C. Trubiani, A. Bran, A. Hoorn, A. Avritzer, H. Knoched, “Exploiting load testing and profiling for Performance Antipattern Detection”, Information and Software Technology, vol. 95, March 2018, pp. 329- 345. +[11] B. Lubomír, B. Tomáš, H. Vojtěch, K. Jaroslav, M. Lukáš, T. Tomáš, +T. Petr, “Unit testing performance with Stochastic Performance Logic”, Automated Software Engineering, vol. 24, n. 1, March 2017, pp. 139-187. +[12] X. Han, T. T. Yu, D. Lo, “Perflearner: Learning from bug reports to understand and generate performance test frames”, ASE 2018 - + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+480 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply. + +Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering, Montpellier, France, pp. 17-28, 3-7 September 2018. +[13] F. Vincenzo, P. Cesare, “A declarative approach for performance tests execution in continuous software development environments”, ICPE 2018 - Proceedings of the 2018 ACM/SPEC International Conference on Performance Engineering, Berlin, Germany, pp. 261- 272, 9-13 April 2018. +[14] T. Hall, S. Beecham, D. Bowes, D. Gray, S. Counsell, “A systematic literature review on fault prediction performance in software engineering”, IEEE Transactions on Software Engineering, vol. 38, n. 6, pp. 1276-1304, 2012. +[15] M. Woodside, D. C. Petriu, J. Merseguer, D. B. Petriu, M. Alhaj, “Transformation challenges: from software models to performance models”, Software and systems modeling, vol. 13, n. 4, pp. 1529- 1552, 2014. +[16] H. W. Wang, H. B. Zhu, L. L. Xiao, W. L. Xie, G. Lu,” Modeling and Verifying OpenFlow Scheduled Bundle Mechanism Using CSP”, 2018 IEEE 42nd Annual Computer Software and Applications Conference (COMPSAC), Tokyo, Japan, pp. 376-381, 23-27 July 2018. +[17] I. Ruiz-Rube, J. M. Dodero, R. C.Palacios, “A framework for software process deployment and evaluation”, Information and Software Technology, vol. 59, pp. 205-221, 2015. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +481 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 06:33:13 UTC from IEEE Xplore. Restrictions apply. 
diff --git a/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt b/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt new file mode 100644 index 0000000..796daab --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/16 - Data Quality Management for Big Data Applications.txt @@ -0,0 +1,198 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2019 Developments in eSystems Engineering (DeSE) +Data Quality Management for Big Data +Applications + Majida yaseen khaleel Prof. Dr. Murtadha M. Hamad + Department of Computer Science D eUpanrivtmeresnittyoof fCAonmbpaurte r Science University of Anbar + Ramadi, Iraq Ramadi, Iraq majdhsyasyns@gmail.com dr.mortadha61@gmail.com + Abstract— Currently, as a result of the continuous increase Several Data Warehouses (DWs) were developed in of data, one of the key issues is the development of systems and different fields. Nevertheless, today's DWs face new applications to deal with storage, management and processing scientific problems. Heterogeneous, independent, scalable of big numbers of data. These data are found in unstructured and distributed are the current sources of data. With the ways. Data management with traditional approaches is difficulties involved, the traditional data warehouse faces inappropriate because of the large and complex data sizes. some constraints, summarized with the following sentence: Hadoop is a suitable solution for the continuous increase in non-existence of scalability owing to problems in data sizes. The important characteristics of the Hadoop are processing combined with natural data. Data nature: new distributed processing, high storage space, and easy semi-structured and unstructured data models and formats administration. Hadoop is better known for distributed file +systems. 
In this paper, we have proposed techniques and have created the need for modern data warehouses to be algorithms that deal with big data including data collecting, integrated and used, but traditional DW can not. +data preprocessing, algorithms for data cleaning, A We have proposed a technique for converting Technique for Converting Unstructured Data to Structured unstructured data to structured data using metadata , Data using metadata, distributed data file system +(fragmentation algorithm) and Quality assurance algorithms distributed data file system (Fragmentation algorithm) and by using the model is the statistical model to evaluate the quality assurance algorithms that decrease above highest educational institutions. We concluded that Metadata limitations and the summation of total query maintenance accelerates query response required and facilitates query cost and response time of the selected views which is execution, metadata will be content for reports, fields and regarded the view selection problem. +descriptions. Total time access for three complex queries in +distributed processing it is 00: 03: 00 per second while in non- II . BIG DATA DEFINITION +distributed processing it is at 00: 15: 77 per second, average is The term big data refers to a huge amount of information approximately five minutes per second. Quality assurance that comes from several sources. Therefore big data do not note values (T-test) is 0.239 and values (T-dis) is 1.96, as a +result of dealing with scientific sets and humanities sets. In the only refer to this huge volume of data but also the variety comparison law, it can be deduced that if the t-test is smaller of data forms, which are supplied at different speeds [2]. 
than the t-dis; so there is no difference between the mean of By 2020, there will be around 20-100 billion connected the scientific and humanities samples, the values of C.V for devices leading to more data collection; thus illustrating both scientific is (8.585) and humanities sets is (7.427), using a necessity for applying big data analytics [3]. This takes the law of homogeneity know whether any sets are more forth the requirement of understanding big data. See Fig homogeneous whenever the value of a small C.V was more 1.[4]. +homogeneous however the humanity set is more homogeneity. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Keywords— Big Data, data quality, unstructured Data Distributed data file system, and statistical model. +I. INTRODUCTION +Currently, large data volumes appear unprecedented in heterogeneous sources (eg Commercial and educational, finance). The proliferation of smart computers and Internet of things will make them a very technical nature . Strong systems and distributed programs behind the scenario support multiple overlapping systems (for example, smart grid systems [1]. + Until the big data revolution, traditional technology lacks high storage capacity, keeping all the archiving for a long time and running large data since large data comes from different sources so we need ways to deal with it, big data needs massive data sets to be cleaned, processed, analyzed, secured, and textured. Analysis of data in companies and industries is becoming increasingly important for competing, finding new ideas and personalizing their services. [1] + +Fig. 1.volume versus variety +A. Reasons for Appearance of Big Data + Recently, there have been some things that have helped this explosion and increase in size and diversity, including: + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +978-1-7281-3021-7/19/$31.00 ©2019 IEEE 357 +DOI 10.1109/DeSE.2019.00072 + +Evaluation Only. 
Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +1. Some regions have very large data for analysis such as meteorology (weather science), genetics (genomics), complex physical simulations, and biological and environmental research [2]. +2.Low storage cost laws that require the continuation of the data in the database to track criminals, vandals and intruders [2]. +3. The advent of Internet technology (IoT), which allows all devices to communicate and interconnect Internet technology and new data production, doors and windows and walls and refrigerators and everything at home connected to the Internet and interact with it [2]. +4. The emergence of social networks (MySpace, Facebook, tweeter and Google) that send large amounts of data over time and various bodies [2]. +III. RELATED WORKS +1) In 2012, by Abdullah Farhan Mahdi [6] Since On Line Analytical Processing (OLAP) is essential in decision- making He built a model for distributing information to several computers linked to a network using the fragmentation algorithm and conducted a query on these computers, the findings resulted in the velocity of complicated issues being implemented in a lot of relative time [6]. +2) In 2015, Jie Songa, Chaopeng Guoa, Zhi Wanga, YichanZhanga, Ge Yub and Jean-Marc Piersonc [7] this paper presents Hadoop based Olap (HaoLap), an OLAP system for big data. designed an OLAP based on hadoop and applied several algorithms to each particular work to perform roll up operation on dimension hierarchy using the dimension coding and traverse algorithm then stored the dimensions and measurements using the partition and linearization algorithm. Results with efficient performance in OLAP and complex query [7]. +3) In 2017, Xiaolei Li, Zhenyu Tu et al., [8] By using big data analysis to enhance performance and enhance rates, new company opportunities can be acquired. 
The data analysis was introduced using industrial enterprises and the off-line data reference model library were developed. By using Spark to introduce the web application that is used with the production of Real Time [8]. +4) In 2017 Sonia Ordoñez Salinas and Alba Consuelo Nieto Lemus [9] Opinions differed regarding the warehouse data and large data some concluded the disappearance of the repository data with the existence of large data, while others completed the integration of the two by discovering the points of convergence and difference between them and the work of joint tasks [9]. +5) In 2018, Konstantinos Vassakis, Emmanuel Petrakis and Ioannis Kopanakis [10]. The huge increase in data varies from one generation to another. In the previous generation, the increase of industrial companies, people and advanced technology led to competing companies among them, but now the increase is the result of the Internet and social networking sites that are growing rapidly [10]. +IV. THE PROPOSED SYSTEM +The proposed system illustrates the main steps from data collection to results obtained using the following algorithms and techniques . +A. The Role Of Metadata + Metadata are an effective task of managing and organizing data while storing it because of the lack of +effective mechanisms such as metadata. Metadata refers to +data that describe other data. It adds more organization to +the data structure, such as the database, and also describes unstructured data such as maps and media Multiplayer [11]. +B. A Technique for Converting Unstructured Data to Structured Data using Metadata approach It is difficult to find a tool for dealing with non- +structured data that can store and retrieve data that are +generated in a structured database. The following steps will +be taken to access non-structured data in the handwriting +form. +Algorithm1 for Converting Unstructured Data to Structured Data using Metadata approach +Inputs: unstructured Data. Outputs: structured Data. 
+_____________________________________________ Start +Step1. Input unstructured data (with various sources). Step 2. Select an affected parameters (features). +Step3.Using these features to create structured metadata using data modeling (relationships) for this purpose. +Step4.Apply (Classification or Clustering task) or any mining or statistical methods (machine learning) for an +efficient accuracy(quality) results +Step5.Data Visualization. End. +C. Distributed Processing. + The distributed file system is a major challenge in dealing with large data as it uses several computers connected to each other using any available networks and in the case of a specific query will be sent to these computers and respond to rapid response and thus saves time in retrieving data [6]. +1. Data Fragmentation + To handle large data, the data are fragmented either horizontally or vertically according to the Fragmentation algorithm to several computers and then dealing with the architecture of Client - Server in the need for a specific +complex OLAP [6] . +2. Replication of data + Replication is one of the technologies used to copy the data to more than one site to maintain in the case of loss of data from the designated place because it is located in the other and used with the process of fragmentation as integrated work in the architecture of Client -Server therefore, the data are stored more accurately and provide more data and give a detailed report of anything whether homogeneous or not [6]. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +358 +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. + +3) Network Regulation + Distributed data operation within the network environment, where possible, should be within the area of building (LAN) or city(MAN). Implementation of the system was based on an internal network (LAN) within organization building. 
The work will be in the architecture Client -Server [6]. +D. Data Quality + Quality is a smart tool for applying sustainable development for all parts of the system at any organization. This is the application of development methods to ensure quality, improvement, sustainability and implementation at high level in practice, operations and performances. [12]. +• General Model of Evaluation +The statistical models are used to evaluate the highest educational institutions based on standard model. The model is used to evaluate the faculty members in these institutions. The faculty members model is based on five measures and each measure is based on standard ratio with the final evaluation measure obtained from the sum of all the five measures with a rate of 100%. These measures are (Scientific Performance with a rate of 35%, Teaching Efficiency with a rate of 25%, Educational Performance with a rate of 10%, Personal Conduct with a rate of 20%, Foundation Performance with a rate of 10). The performance of the scientific colleges is compared with the performance of the humanism colleges depending on colleges evaluation results with statistical forms using the (T-test) for comparison and the (COV) to know the homogeneousness between the scientific colleges and the humanity colleges[12]. +• The Arithmetic Mean + Using (1) and the percentage law we can be find the final average to evaluate the university then to the college and then each person in this college [12], +¦n X +X = i=1 i (1) +n +To compute the arithmetic mean we use (1) Where n is the size of sample +The arithmetic mean (or average) of the squared deviation (Xi −X)2 is called the variance. The variance denoted +symbolically by s2 . Its formula is: +¦n X −X)2 += i=1 ( i (2) +s2 +n−1 +Where n is the sample size. + The square root of the difference is the standard deviation, as shown in (3). 
It is used to determine the dispersion of the performance of scientific colleges and the dispersion of the performance of colleges of humanity. +The (S) symbol refers the square root of standard deviation +of variable x .[12]. +¦n (Xi −X)2 +s = i=1 (3) +n−1 +• Statistical Comparison Functions + Statistical comparison has several functions. Here, two comparisons of statistical comparisons were performed on the basis of each of the two components between the performance of comparative scientific colleges and the performance of humanitarian colleges in the following form: +A. T-test + T-test is used to compare between two separate accounts mediums. Its mathematical formulations are illustrated in (4) It depends on the mean and variance of the two sets. Also it brings on a degree of freedom (df) and identify the moral (.), in order to find ( t scheduled ) which can be found from the intersection of (df) with (.)[12], +(X −X )−(μ −μ ) +t = 1 s2p 2 1 + 11 §¨2 (4) · + +n1 n2 © ¸¹ + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +359 +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. + +¦n By sample size(n) is the sum of all measurements where X1 and X2 = means of samples 1 and 2 +aatr hevn epedrreaxaitsvgise•eendrTatiehsgpdeee b.rVvsyaiaolrtunihae.ensI ctcteihosaatrihntasecdt cmethereen(dtSSriaa2taln)n.oSdIfqat rutidhas erDecsomevmoaiftaphtduieeotmevniafatrtioicomsn sat nhodef populationsn11 masa2nn2edda0= nsn2s2. ==1 sstiaz1ne)dssao1rdf + sdae(mvnip2alteiso n1sao2nfds2amples 1 and 2 Ti=h1e average or the percentage is called the arithmetic (μ1 −μ ) = hypothesized difference between the +Xi + The variance is a measurement for variation of the data +scientific (2) which represents the variance to a sample[12]. 
( n − 2 − 1 ) s 2 +Deviation is the difference between an individual data with p n +n −2 +value xi and the mean X and, it is called the deviation of 1 2 +Xi s2 from X , that is deviation = Xi −X and df = n1 +n2 −2 , Confidence interval for μ1 −μ2 +1 + 1 (X1 −X2) ±tσ / 2 s2p (n1 n2 ) + With σ =(1_ Confidence coefficient). +there is a difference between the average of the two samples if the t calculated is greater than the t scheduled. Otherwise, there is not a difference between the average of the two samples if the t calculated is lower than the t scheduled. +B. The Coefficient of Variation + Equation (5) is a statistical function to compare between two different samples based on standard deviation. It is used to find out how distortion data is in the data, where the higher the data indicates that the data is dispersed, +indicating that the data is more homogeneous and vice Fig.3. the original data set. +versa. + To handle large data, you can defragment vertically by the following example "SELECT * FROM item Where +c.v = s × 100 item_ quentety = 209"; see fig.4. (5) +X +V. THE RESULTS AND DISCUSSION + In this section , the execution of the proposed algorithms for converting unstructured data to structured data using metadata ,distributed processing(fragmentation), and data quality, which helps decision makers to obtain good results and to make the right decisions . +A. Metadata of Sales + In this section of the proposed system the description of the files (tables) used in data warehouse and details of the reports again the sales system : 1. Metadata for tables that used in sales system. 2.Metadata for complex OLAP query(reports) against sales system. For example Metadata of item Table in table 1. +TABLE.1. METADATA OF ITEMS TABLE + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +361 +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. + + +B. 
Distributed processing +• Data Fragmentation + To handle big data, R are the original data to be split into horizontal data (R1) or vertical data (R2) that contains sufficient data then retrieve the complex queries required from these fragments . It is possible to return the fragments to their original data by collecting them. see fig,3. +Fig.4. Vertical fragmentation + And to handle large data, you can defragment horizontally by the following example "SELECT item_id, item_name, item_code FROM item”; see fig.5. + + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. + +Fig.5. horizontal fragmentation + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. + + By applying the proposed system algorithms, we found: First: Response Time of Query + The query response time in the OLAP and decision support systems is critical and very important. By applying distributed processing algorithms to the sales system, we concluded that when processing large data time saving (i.e. the system requires a few minutes), high quality and data retrieval speed. Therefore, the implementation of the query on the distributed processing provides us with fast response time and speeds up decision making. See fig. 6. +00:14:24 with out dis. +processing 00:07:12 distributed +processing 00:00:00 +total Q3 Q2 Q1 +time +Fig.6 . 
Execution time of OLAP query in Distributed processing +Second : Evaluation of higher education institutions + We can apply statistical models to the big data were to be Iraqi universities and evaluated according to the standards mentioned and therefore we applied statistical models at the level of Anbar University as a sample of Iraqi universities . Evaluate and Compare Science with human Section The percentages are illustrated in table 2,3,4. + After taking several colleges and applying them a statistical models to five measures. The following results are illustrated in different fig.7 and fig.8. + +Fig.7. Rate assessment of final evaluation of the colleges + +Fig.8 .Rate assessment of scientific and humanity colleges +TABLE 2. EVALUATION OF THE SCIENTIFIC SECTION WITH HUMANITIES + +TABLE3. A COMPARISON OF TWO SETS TO KNOW DIFFERENCE + +TABLE 4. COMPARED TO THE TWO SETS TO KNOW HOMOGENEITY + +VI. SYSTEM EVALUATION + The design and implementation of proposed system can +be evaluated as: . +1. response time: we used the proposed system to process +large numbers of data and realized that it would take a few +minutes or seconds to answer the complex queries. +2. Ease of application: algorithms can be applied using any programing environment. +3. Accuracy: the accuracy of query optimizing based on the +selection best set of views and tables that will be used for +creating new query by applying proposed algorithm for optimizing the query. + We compare this thesis results with other results based the following factors in the table 5. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +362 +Authorized licensed use limited to: University of Wollongong. Downloaded on May 31,2020 at 12:23:03 UTC from IEEE Xplore. Restrictions apply. 
diff --git a/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt b/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt new file mode 100644 index 0000000..bfd345b --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/17-Research_on_Security_Detection_and_Data_Analysis_for_Industrial_Internet.txt @@ -0,0 +1,109 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2019 IEEE 19th International Conference on Software Quality, Reliability and Security Companion (QRS-C) +Research on Security Detection and Data Analysis for Industrial Internet + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Lin Jun +China Electronic Product Reliability and Environmental Testing Research Institute, + Guangzhou, Guangdong, China, 510610 Email: linjun@ceprei.com +Abstract— Industrial Internet platform needs to solve a series of problems, such as access of multi-type industrial equipment, multi-source industrial data integration, massive data management and processing, industrial Internet security and so on. This paper builds industrial big data analysis algorithm library based on domain knowledge modeling and big data analysis of industrial data. Through the analysis of the behavior characteristics of industrial internet network traffic data, this paper studies the method of selecting traffic characteristics of events in the industrial Internet; establishes the propagation and evolution model of security events in the industrial Internet, and builds a traceability map of security event propagation; This study combines the characteristics of large data volume and centralized control of future industrial Internet to reduce the complexity of security event detection and analysis. 
It has reference value for industrial Internet controller to formulate node routing strategy. +Keywords—Industrial Internet, Future network, Big Data, Security Detection +I. INTRODUCTION +Industrial Internet is a name given to the current trend of automation and data exchange in manufacturing technologies. It includes cyber-physical systems, the Internet of things, cloud computing and cognitive computing[1]. It is marked by emerging technology breakthroughs in a number of fields, including robotics, artificial intelligence, nanotechnology, quantum computing, the Internet of Things, the Industrial Internet of Things, fifth-generation wireless technologies (5G), additive manufacturing/3D printing and fully autonomous vehicles. +The fourth wave of the industrial revolution is expected to see the heavy implementation of several emerging technologies with a high potential of disruptive effects [2-3]. +There are many challenges in implementation of Industry Internet, for example: IT security issues, which are greatly aggravated by the inherent need to open up those previously closed production shops. Industrial Internet need to maintain the integrity of production processes. Industrial Internet need to +Liu Lan * +College of Electronic and Information, Guangdong Polytechnic Normal University, + Guangzhou, Guangdong, China, 510655 Email: hust_ll@126.com +avoid any IT snags, as those would cause expensive production outages. And Cloud and data security is a big challenge of Industrial Internet. There are many companies like Symantec, Cisco, and Penta Security have already begun to address the issue of IoT security. +Industrial Internet is the focus of industrial development, and the control system is at the core of the whole industrial system. After the combination of industrial system and Internet, the system architecture has changed from controls-centered to industrial big data as the core [4]. 
Changes in the industrial Internet architecture have made information and data security very important. Based on the current situation of global industrial Internet development, this paper analyzes the new demands of industrial Internet development on network, studies the collection and integration of industrial big data, and analyzes the data processing and security problems facing industrial Internet in the future. Through the pilot experiments in automotive electronics, 3C manufacturing and other industries, it provides some reference for the future development of industrial Internet network architecture. +II. BACKGROUND AND RELATED WORK +Domestic and foreign researchers attach great importance to the research and application deployment of new technologies and networks, and actively explore the use of IPv6, Internet of things, software-defined network (SDN), 5G and other technologies to build industrial Internet that meets the requirements of high reliability, low delay and wide coverage. Among them, the future network data analysis and security research for the industrial Internet is an important direction that needs attention [5-6]. +The Industrial Internet requires large-scale network infrastructure to provide support, and data-driven network architectures provide possible solutions. For example, in [4], a new network architecture consisting of data plane, control plane, information plane and market plane is proposed, which replaces state complexity with computational complexity. Support data selection through data intelligence, solve + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +978-1-7281-3925-8/19/$31.00 ©2019 IEEE 466 +DOI 10.1109/QRS-C.2019.00089 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +problems that are difficult to optimize in the network through data association analysis, and improve network service quality. 
+For the heterogeneity of physical implementation technologies and the massive data in the industrial Internet, it is necessary to provide the ability to detect, receive, transmit, and process large amounts of data. In order to realize data processing between heterogeneous networks, a unified interoperability model is needed. Virtualization technology and SDN technology provide ideas for the unified optimization, control, and deployment of heterogeneous network resources [7]. +Industrial Internet is faced with more complex security issues. We need to combine the industry domain knowledge to study new security protection mechanisms suitable for the development of industrial Internet. For the security protection of industrial Internet, more research and exploration pointed out that the typical cyber-physical-system (CPS) architecture supporting Industry 4.0 can be represented by a layered 5C model [8], they are the connection level, Data to information conversion level, cyber level, cognition level, and configuration level. According to the 5C model, the Industrial Internet needs to support flexible devices and sensor networking, real-time reliable information transmission, and efficient big data storage analysis. For the future network security of industrial Internet, it is mainly divided into five aspects: equipment security, network security, control system security, platform security, and data security. The industrial Internet needs to comprehensively analyze and process the big data traffic of heterogeneous systems from five aspects, realize traceability analysis of abnormal/aggressive behaviors, and timely discover abnormal behaviors and alarms in the network. Take appropriate security measures for each level in the platform. +III. 
RESEARCH ON DATA ANALYSIS OF INDUSTRIAL INTERNET +Based on the industrial Internet network data, this paper combines large data analysis, cloud computing and edge computing to carry out data collaborative analysis of intelligent equipment, forming an overall solution of network manufacturing and industrial Internet, solving the real-time, reliable and safe problems of intelligent manufacturing field network. Research on key technologies such as abnormal product state anomaly detection, trend prediction and fault diagnosis, including heterogeneous multi-source mass industrial big data analysis technology and industrial data security analysis technology. The system framework is shown in Fig 1. +1. Heterogeneous multi-source industrial big data acquisition technology based on CPS +To deal with the huge amount of data generated by the heterogeneous industrial Internet equipment, and to analyze and deal with the large amount of network industrial data, these are all problems that need to be considered in the development of industrial Internet. We need to build an industrial monitoring system oriented to the big data environment, analyze and +coordinate all kinds of heterogeneous and industrial big data, adjust corresponding management and production strategies according to the results, and make the overall industrial network adapt to the dynamic and overall requirements of the big data environment. +Starting from equipment automation and product intelligence, we put forward a heterogeneous terminal architecture integrating distributed perception and reliable transmission, transformed various intelligent equipment required by production, and established a CPS network system. By building a more accurate and efficient data acquisition system, we can comprehensively collect industrial big data and conduct real-time production monitoring. +Realizing the intercommunication of numerical control equipment is the core of the intelligent factory. 
We realize the data collection of distributed network of numerical control equipment, robots, automatic production lines and other digital production equipment through the Internet technology based on IoT, industrial Ethernet, Zigbee, Bluetooth and other network technologies. The data acquisition module supports connecting the equipment of different interfaces (such as RS232, RS422, RS485, RJ45, etc.), different communication protocols (TCP/IP, wireless, etc.), different control systems (such as Fanuc, Siemens, Mitsubishi, Heidenheimer, Mazak, Fagor, Agie and other CNC equipment or PLC equipment control system) into a network, and realizing real-time acquisition of equipment status. For machine tools with network CARDS, we can directly collect the real-time status of the machine, program information, the number of pieces of processing, speed and feed, alarm information and other rich information, and collected into the database for further processing. +2. Industrial Data Modeling and Big Data Analysis Technology Based on Domain Knowledge +Spark, Hadoop, Storm and other big data frameworks are widely used in batch and stream processing of massive data. Various machine learning algorithms such as decision tree learning and Bayesian learning, especially artificial intelligence algorithms represented by deep learning and transfer learning, are becoming effective tools for industrial Internet to solve diagnosis, prediction and optimization problems in various fields. +After data collection, merging and cleaning of industrial Internet data, part of redundancy is removed. However, for the whole industrial Internet system, it can only be called initial data. The core data that really needs to be found can be obtained through correlation analysis based on the entire network topology environment, the time and frequency of events, and so on. 
We use artificial intelligence algorithms such as machine learning to achieve clustering, correlation and predictive analysis of historical data, real-time data, and time series data. We have accumulated some experience in our previous work +[9]. +In the process of industrial big data processing, we build the industrial big data algorithm library. Through deep + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +467 +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply. + +knowledge of the physical, chemical principles, processes and manufacturing related to the field, the company meets the high confidence requirements of industrial data. +Heterogeneous multi-source Industrial Devices IOT ZigBee TCP/IP Bluetooth Wireless PLC +Raw Data (from different Industrial devices) + + Industrial Data Integration Industrial Data Extraction +Core Data (Standardized) +Filter; Aggregation; Correlation; Normalization +Industrial Data Analysis +Machine learning\Statistics\ Data Mining + +MovingAVG ExpSmooth Copula, trend analysis. Inter-related rules +Domain-Knowledge DB +Automobile factory, 3C Electronics factory +Application and Testing + Fig 1. Industrial Internet data and security analysis framework +The data analysis library uses analytical models suitable for R language and Spark MLlib, such as Copula (commonly used for risk analysis), ExpSmooth (exponential smoothing model, which is a more general predictive model), MovingAVG +(moving average model, commonly used for product demand growth prediction) and Trend (trend analysis) and so on. In addition, there are early warning prediction and rolling prediction services. Visualization technology is used for multi-dimensional analysis and reasoning interpretation to realize visual display of analysis results. 
According to different scenarios, different analysis methods can be selected to support general analysis interfaces including SQL and Restful services. We study basic domain knowledge and model libraries, maintain data mining analysis programs and model algorithms, and save models and algorithms for easy recall. +IV. INDUSTRIAL INTERNET SECURITY MODEL AND ANALYSIS TECHNOLOGY +In the future network, we use the characteristic data found by the previous research steps to analyze the traffic data in the network nodes and reconstruct the path of network attack. In the process of analyzing the network data packets, the traceability map is constructed according to the relevant path information, and the location of the malicious code is speculated and the attacker is found. At the same time, the spread of network malware on the Internet is a dynamic complex network challenge. +The development of the industrial Internet puts higher demands on network management and network security. However, the traditional network has high hardware coupling and is difficult to expand. It cannot adapt to the changes of the industrial network topology, and it is difficult to meet the flexible and customized requirements of industrial applications. The core idea of SDN is to decouple the control plane and data plane of the network device, and the control function is completed by the controller that masters the global information of the network. With its simple network architecture and strong compatibility, SDN has not only received the attention of academic circles, but also the support of network equipment manufacturers, and has become the focus of research in the network field. +The flexible configuration of the SDN controller is the future development direction of the industrial Internet. Due to the separation of SDN network control and forwarding, loopholes caused by various applications are inevitable. 
Security issues such as malicious code and DDOS attacks are also faced by the future Industrial Internet. We study the malware traffic characterization model in the Industrial Internet. Through the traffic collection and feature analysis of the industrial Internet flow table data, the matching classification algorithm is found to accurately discover various malicious attacks. We also study the sampling scheme of SDN packet attack detection in the industrial Internet environment. These studies provide a good reference for dynamic security protection under the industrial Internet. +1. Research on dimension reduction method of industrial internet traffic +In the future industrial Internet, key data monitoring can be performed at each node according to the characteristic difference between different data packets of the network node, and the data packet matching the feature value is given a + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +468 +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply. + +response, and the transmission path of the corresponding data packet is obtained. Realize network data traceability. Since the future network is based on flow tables, the flow table can be used as a matching rule for data packets. As the flow table design supports various protocols, the matching is more granular, and the feature values are also increased. Previous studies have shown that most classification or clustering algorithms are not suitable for a large number of high- dimensional sample sets, and cannot quickly complete the determination of large-scale unknown malicious code. We believe that feature selection is an effective method for secure data preprocessing. By reducing the dimension of traffic characteristics, the complexity of security association analysis can be reduced. 
We pay attention to the application of feature selection method in future network switch traffic data. We use Fisher, ReliefF, mRMR, InfoGain, CFS, LVF and other feature selection methods to sort traffic characteristics and perform comprehensive analysis according to different feature selection algorithms. Effective traffic characterization data is used to build the next model. +2. Research on Optimal Feature Subset and Classification Algorithm Selection of Industrial Internet Security Events +We study the matching degree of different feature selections on algorithm running time and different feature selection methods and classification algorithms. There are many reasons for abnormal traffic, such as DDOS attacks, Witty worms, slow scans, etc., which have different performances in traffic characteristics. This project intends to separate the first 8-12-dimensional feature sequences obtained by Fisher, ReliefF, and InfoGain. Combined with different deep learning algorithms, the accuracy of the classification results is calculated, and the best eigenvalues of different types of security event detection and analysis are found. +3. Research on the provenance tracking model of security events for the future industrial Internet [10] +This study establishes the future industrial Internet model, considering the network subnet as a community, the subnet is a static community, and the subnets are dynamic communities. By analyzing the impact of node mobility between communities on the infection and outbreak time of security events on the source and destination subnets in different network models. In the mobile environment, the influence of the spread of malicious code on the evolution of the network is studied. Based on this model, the trace path of the security event is found by constructing the traceability map. In this way, the administrator can analyze each event on the propagation path to provide a theoretical basis for the control strategy of the industrial internet. 
+4. Research on Attack Packets Sampling Strategy in Industrial Internet Environment Based on Game Theory +We design and simulate an Industrial Internet packet sampling strategy, using zero-sum game and analyzes the security of multiple Industrial Internet topology networks. The Industrial Internet packet sampling problem is modeled as a zero-sum security game, in which both attackers and defenders +participate, and the importance of each point is quantified into the income value. The income of the attackers and defenders are determined according to the income value. Under the knowledge of incomes of attack and defense, we determine the Industrial Internet topology with the highest security performance and security defense strategy. +V. CONCLUSION +Based on the design concept of Industrial Internet and future network, this paper uses the efficiency of deep learning algorithm to analyze heterogeneous data processing and security analysis of industrial internet, and realize data propagation model and event detection method in industrial internet. +We collect industrial data from heterogeneous multi- sources, integrate, clean, and fuse data from data modules and acquisition modules of the Industrial Internet. The project carries out modeling and big data analysis on industrial data based on domain knowledge, and establishes the industrial big data algorithm base. We design professional knowledge acquisition, representation and association methods, in-depth mining domain-related knowledge; By analyzing the traffic characteristics of industrial Internet, the paper studies the selection method of traffic characteristics. 
Establish the event propagation and evolution model in the future industrial network environment, and build the traceability diagram of security event propagation; In the research process, we proved the effectiveness of the project method through detailed analysis and test application examples, and verified it in automobile electronics and 3C manufacturing industry, so as to accumulate application data for data analysis and network security monitoring under the future industrial Internet architecture. +Acknowledgements +This research is supported by Special project for research and development in key areas of Guangdong Province (2019B010121001), Guangdong Provincial Department of Education Innovation Project (2016KTSCX078) +REFERENCES +[1] The new industrial revolution[R/OL].[2019-03-7]. https://en.wikipedia.org/wiki/Industrial_Revolution +[2] Manekar A K , Pradeepini G . Cloud Based Big Data Analytics a Review[C]// International Conference on Computational Intelligence & Communication Networks. IEEE, 2016. +[3] Lee J , Bagheri B , Kao H A . A Cyber-Physical Systems architecture for Industry 4.0-based manufacturing systems[J]. Manufacturing Letters, 2015, 3:18-23. +[4] Yin H , Jiang Y , Lin C , et al. Big data: transforming the design philosophy of future internet[J]. IEEE Network, 2014, 28(4):14-19. +[5] Sarkar S , Chatterjee S , Misra S . Assessment of the Suitability of Fog Computing in the Context of Internet of Things[M]// The clash of cultures :. Heinemann Educational Books, 2015. +[6] Kreutz D,Ramos F M V,Verissimo P E, et al. Software-Defined Networking: A Comprehensive Survey[J]. Proceedings of the IEEE, 2015, 103(1):14-76. +[7] Hu F . Network Innovation through OpenFlow and SDN: Principles and Design[J]. CRC Press, 2014. +[8] Machii W , Kato I , Koike M , et al. Dynamic Zoning Based on Situational Activate for ICS Security[C]// Control Conference. IEEE, 2015. +[9] Lan L , Jun L . 
Some Special Issues of Network Security Monitoring on Big Data Environments[C]// IEEE International Conference on Dependable. IEEE, 2014. +[10] Lan L, Ryan K. L.K, Guangming R et al. Malware Propagation and Prevention Model for Time-Varying Community Networks within Software Defined Networks. Security and Communication Networks [J]. +2017. https://doi.org/10.1155/2017/2910310 +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +470 +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:26:53 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt b/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt new file mode 100644 index 0000000..6eb5cc8 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/19-A Model-Driven Architectural Design Method for Big Data Analytics Applications.txt @@ -0,0 +1,151 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2020 IEEE International Conference on Software Architecture Companion (ICSA-C) +A Model-Driven Architectural Design Method for Big Data Analytics Applications +Camilo Castellanos∗, Boris Perez´ ∗†, Dar´ıo Correal∗ Carlos A. Varela +∗System Engineering and Computing Department Computer Science Department University of Los Andes, Bogota,´ Colombia Rensselaer Polytechnic Institute, Troy, NY, USA +Email: cc.castellanos87, br.perez41, dcorreal@uniandes.edu.co Email:cvarela@cs.rpi.edu †Department of Systems +Francisco de Paula Santander University, Cucuta,´ Colombia + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+ +Abstract—Big data analytics (BDA) applications use machine learning to extract valuable insights from large, fast, and hetero- geneous data sources. The architectural design and evaluation of BDA applications entail new challenges to integrate emerging machine learning algorithms with cutting-edge practices whilst ensuring performance levels even in the presence of large data volume, velocity, and variety (3Vs). This paper presents a design process approach based on the Attribute-Driven Design (ADD) method and Architecture tradeoff analysis method (ATAM) to specify, deploy, and monitor performance metrics in BDA applications supported by domain-specific modeling and DevOps. Our design process starts with the definition of architectural drivers, followed by functional and deployment specification through integrated high-level modeling which enables quality scenarios monitoring. We used two use cases from avionics to evaluate this proposal, and the preliminary results suggest advantages by integrating multiple views, automating deployment and monitoring compared to similar approaches. +Index Terms—Software architecture, Attribute-Driven Design, ADD, ATAM, Big data analytics deployment, DevOps, Domain- specific model, Quality Scenarios +I. INTRODUCTION +Big data analytics (BDA) applications use Machine Learn- ing (ML) algorithms to extract valuable insights from large, fast and heterogeneous data. These BDA applications require complex software design, development, and deployment to deal with big data characteristics: volume, variety, and velocity (3Vs) while maintaining expected performance. BDA develop- ment involves three knowledge domains: business, analytics, and technology. In the business domain, business users define business goals and quality scenarios (QS) to drive analytics projects. In the analytics domain, business goals are translated into specific analytics tasks by data scientists. 
In the tech- nology domain, architects make decisions in terms of tactics, patterns, and deployment strategies addressing QS. The current design approaches do not address this multi-domain nature and complexity involved in BDA application development which frequently leads to delayed deployments [1]. Due to the lack of methods and tools to enable integration and alignment of multiple domains, BDA development presents a costly +The authors would like to thank Amazon Web Services educational research for granting us their cloud resources. +transition between development and production environments (“Deployment Gap” phenomenon [1]). +ACCORDANT [2] is a Domain-Specific Model (DSM) approach to formally specify, develop, deploy, and monitor BDA solutions bridging the gap between analytics and IT do- mains. This paper proposes an extension of the ACCORDANT Method by including architectural inputs (drivers) and aligning to the Attribute-Driven Design Method [3] (ADD 3.0), and to promote the architecture testability following evaluation meth- ods such as ATAM (Architecture tradeoff analysis method) [4]. The proposed method is a model-driven approach that allows us to design, assess, and deploy integrated BDA applications based on architectural drivers: quality scenarios, constraints, tactics and sensitivity points. This proposal was validated with two use cases from the avionics field by designing functional and deployment models, and assessing performance QS in distributed batch and micro-batch processing contexts. The contributions of this paper are: 1) A DSM method to design and evaluate BDA architectures aligned to drivers thus accelerating iterative development and deployment. 2) Three integrated domain-specific languages (DSLs) to specify architectural inputs, functional and deployment view. 3) The experimentation of this proposal on two avionics use cases using different deployment strategies and QS. +The rest of this paper is organized as follows. 
Section II describes the background. Section III reviews related work. Section IV details our proposal. Section V describes the experimentation. Section VI reports preliminary results. Finally, Section VII summarizes the conclusions and next steps. +II. BACKGROUND +A. Software Architecture Design +An architecture description is composed of architectural views to address different concerns, and these views are built based on the collection of patterns, templates, and conventions called Viewpoints. The architectural design is driven by QS and functional requirements through a systematic design method, such as ADD [3], and it could be evaluated using methods such as ATAM [4]. ADD comprises 7 steps: 1) Review inputs (purpose, functional requirements, QS, and constraints). 2) In each ADD iteration, a design goal is defined from these + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +978-1-7281-4659-1/20/$31.00 ©2020 IEEE 89 +DOI 10.1109/ICSA-C50368.2020.00026 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +inputs. 3) Choose system elements to refine. 4) Choose design concepts to satisfy the selected drivers. 5) Instantiate architectural elements and define interfaces. 6) Sketch views and record design decisions, and 7) Analyze current design and review goal achievement and design purpose, and start a new iteration (from step 2), if selected drivers are not satisfied. +B. Infrastructure as Code and BDA Deployment +Infrastructure as Code (IaC) arises from the necessity to handle the infrastructure setup, evolution, and monitoring in an automated and replicable way through executable specifications. IaC promotes the reduction of cost, time and risk of IT infrastructure provision by offering languages and tools which allow to specify environments, operative systems, middleware, configuration resources and allocate them automatically. 
Porta- bility plays a key role to deploy, operate, and evolve BDA applications due to the wide range of BDA technologies. Hence, portable standards appear such as Predictive Model Markup Language (PMML)1. PMML models specify machine learning models and data transformations along with their metadata. The PMML standard is supported by a wide range of data science tools such as R, SAS, IBM SPSS, among others. +III. RELATED WORK +Several works have proposed frameworks to build and deploy BDA applications. We review and compare some of the most relevant works in Table I highlighting the important features. In the analytics domain, we compare if they use separation of concerns (SoC), cross-industry application (CI), and support of technology-neutral models (TNM). Regarding software architecture concepts, we include: QS specification (QSS), functional (FV) and deployment (DV) views, tactics (AT), and target-technology assignment (TTA: predefined tech- nologies (P) or extensible code generators (C). Considering DevOps practices, deployment specification (DS) defines if only a number of instances (I) per component or a whole deployment diagram (D) can be described. Finally, practices as continuous deployment (CD), QS monitoring (QSM), and self-adaptation (SA) support IT operations. +Some works have presented DSM to model analytics func- tions, however, they do not tackle architecture concepts and deployment considerations because they are only focused on functional definitions. Lechevalier et al. [5] introduce a DSM framework for predictive analytics of manufacturing data using artificial neural networks to generate analytics models. Sujeeth et al. present in [8] OptiML, a DSL for machine learning which describes analytics functions using a statistical model that covers a subset of ML algorithms, this analytics functions are analyzed and optimized before the code generation. 
+In contrast, we found another group of studies interested in infrastructure concerns of BDA applications leaving aside their functional components. Gribaudo et al. [6] propose a mod- eling framework based on graph-based language to evaluate the system’s performance of running applications that follow +1http://dmg.org/pmml/v4-3/GeneralStructure.html +the lambda architecture pattern. Huang et al. [7] introduce a model to design, deploy, and configure Hadoop clusters through architecture metamodel and rules, which describe BDA infrastructure and deploy automation. +A final group of works combines functional definitions and deployment specifications. QualiMaster [9] focuses on the processing of online data streams for real-time applications such as the risk analysis of financial markets regarding metrics of time behavior and resource utilization. QualiMaster aims to maximize the throughput of a given processing pipeline. Fastscore [10] is a commercial framework to design and de- ploy analytics models. Analytics components are convention- ally developed using a determined programming language or technology-neutral models, and once imported to the platform, they can be connected to data inputs and outputs. SpringXD +[11] is a unified, distributed, and extensible system for data ingestion, analytics, processing, and export to simplify BDA development and deployment. Finally, the DICE project in +[12] presents a DSM offering big data design that comprises data, computation, technology-frameworks, and deployment concepts to design and deploy data-intensive applications. DICE proposes a model-driven approach to develop applica- tion models that are automatically transformed into IaC. +IV. THE ACCORDANT METHOD +This proposal aims at offering a high-level approach to design BDA solutions starting from architectural artifacts, instead of source code. 
Specifically, we propose an architecture design and development method based on ACCORDANT [2] framework to deal with architectural drivers, functional, and deployment views. Our proposal comprises a design and deployment method, and its underlying metamodel. This metamodel extends that proposed in [2] by including archi- tectural inputs and serverless deployments. Fig. 1 depicts the ACCORDANT Method steps, which specializes and integrates ADD and ATAM concepts in the BDA domain. +The steps performed in the ACCORDANT framework are framed in solid lines, while the steps made with external tools are in dotted lines. ACCORDANT is iterative and composed of seven steps: 1) Elicitation of drivers (business goals, QS, and constraints) by business users and architects. 2) The data scientist builds and data transformations and analytics models (exported as PMML files) addressing the business goals. 3) The architect designs the software architecture in terms of functional view(FV) and deployment view(DV). FV makes use of PMML models to specify the analytics components’ behavior. 4) FV and DV models are interweaved to obtain an integrated model. 5) Code generation of software and infrastructure is performed from integrated models. 6) The code generated is executed to provision infrastructure and install the software. 7) QS are monitored in operation, and new design iterations can be made to fulfill the drivers. +A. Architectural Drivers Elicitation +According to ADD and ATAM, architecture design and evaluation are driven by predefined quality scenarios (QS) + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +90 +Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply. + +TABLE I +RELATED WORK + +Work SoC Busin ess(Analytics) Softw areArch itectur e De vOps CI TNM QSS FV DV AT TTA DS CD QSM SA Lechevalier et al. [5] +Gribaudo et al. [6], Huang et al. 
[7] OptiML [8] +Qualimaster [9] +FastScore [10] +SpringXD [11] +DICE [12] C +C P C D +I I D ACCORDANT C D +Fig. 1. ACCORDANT Method Overview + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +91 +Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply. + +which must be achieved through design decisions compiled in well-known catalogs of architectural patterns and tactics. QS and tactics are inputs of the architecture design, therefore we include these initial building blocks in the ACCORDANT metamodel along with other concepts like constraints. Fig. 2 details the main input building blocks grouped by a (Project) which contains the elements required to start the architectural design: QS (QScenario), Analyzed QS (AnalyzedQS), SensitivityPoint and Tactic. A QScenario determines a quality attribute requirement for a specific Artifact. Thus, for instance, a QS could be defined as “latency <= 3 seconds” for an artifact (software component or connector). A QS is analyzed through an AnalyzedQS, and sensitivity points. A SensitivityPoint is a decision’s property (a set of elements and their relationships within architectural views) that is critical for achieving the QS, and that such decision is the application of a Tactic to a specific application context. Finally, Constraints restrict architectural decisions, e.g. mandated technologies, vendors, or processing models. This step covers ADD’s steps 1 and 2. +B. Analytics Model Building +The data scientist builds and evaluates data transformations and analytics models using data science tools, which are independent of ACCORDANT. This approach decouples analytics models and software architecture supported by the portability given by PMML format, but also it enables us to offer an integrated multi-domain framework. +C. 
Software Architecture Design +Once drivers are defined in step 1, architecture is designed in the step 3 and expressed on the views instantiating tactics + +Fig. 2. Excerpt of Architectural Inputs Metamodel. +in a concrete application. These decisions are associated via SensitivityPoints, and they will be evaluated against the initial QS to validate whether the architecture is achieving its goal. This step spans from steps 3 to 6 in ADD. +Functional View allows us to design analytics pipelines in terms of ingestion, preparation, analysis and exporting building blocks. FV specifies functional requirements of the analytics solution, and the constructs are described in a technology-neutral manner. FV is expressed in a component-connector model. Sensitivity points can be associated to components and connectors to represent where architectural decisions have impact regarding the QS. Component metaclasses are specialized in Ingestors, Transformers, Estimators and Sinks. Estimators and Transformers are the software component realizations of +PMML predictive models and data transformers respectively. A Component exposes required and provided Ports. Connectors metaclasses transfer data or control flow among components through input or output Roles. A set of connector types are defined: Procedure Call, Event, Stream, Adaptor, Distributor and Arbitrator. +Deployment Viewpoint includes DevOps practices starting with the specification of how software artifacts are deployed on a set of computation nodes. DV metamodel comprises Pod, ExposedPort, and Deployment metaclasses to operationalize BDA applications. A FV model can be deployed in different DV models either to use a different strategy or to test the fulfillment of predefined QS. DV contains Devices, Services, Deployments, serverless environments (ServerlessEnv), and Artifacts. Sensitivity points can be assigned to Deployments and Artifacts to map critical architectural decisions in the DV. 
Devices (physical or virtual), Pods, and ExecEnvironment) constitute the main elements to provision virtual machines or containers-based infrastructures. On the other hand, Server- lessEnv element describes a computing environment in which the cloud provider dynamically manages the allocation of machine resources. Finally, Artifacts correspond to executable or deployable representations of functional elements (i.e. com- ponents and connectors from FV) which can be deployed on either execution or serverless environments. +D. Integration, Code Generation, and Execution +Once PMML, FV and DV models are designed and in- tegrated, code generation takes place using model-to-text transformations. Code generation is twofold: software and infrastructure (IaC) code. On the software side, each com- ponent and connector is assigned to a specific technology regarding their properties and constraints. Such assignment enables us to generate code for target technology restricted to those constraints. The analytics model’s inputs and outputs are transformed to the component’s interfaces (required and provided respectively). To monitor QS, the code generators include specific machinery at application level to measure specific metrics (e.g. response time, throughput, deadline, etc) for each artifact according to its associated QS. This allows us to reduce code for logging starting from high-level quality specifications. On the IaC side, DV model is transformed into Kubernetes’ configuration files, used to create and configure infrastructure over the Kubernetes where software artifacts can be automatically deployed using the FV-DV mappings. +E. Solution Monitoring +In the last step, the performance metrics of the BDA application are gathered to be compared to initial QS and evaluate the fulfillment of quality requirements. In this step, the architect has to check the outputs and to make decisions in the architectural views. 
This process can take several iterations, and this is the whole cycle that we expect to accelerate and using ACCORDANT. This ACCORDANT’s step corresponds to analyze drivers’ achievement in ADD (step +7), and to analyze architectural approaches evaluated against each scenario in ATAM. +V. EXPERIMENTATION WITH AVIONICS USE CASES +Our experimentation aims to compare development and deployment time for each iteration with other two frameworks reviewed in Section III: FastScore and SpringXD. We chose these frameworks because they are the closest to our approach, and they support portable analytics models. +We validated our proposal using two use cases: UC1) Near mid-air collision detection, and UC2) Near mid-air collision risk analysis. These use cases are applied to analytics models, they also illustrate BDA facets as streaming and micro-batch to deal with the velocity aspect and batch processing. More details about the use cases can be found in [13], and source code is publicly available2. +Use case 1 (UC1) was applied in aviation safety to detect near mid-air collisions (NMAC) on different air space ranges with different deployment models while performance QS is monitored. NMAC detection comprises a pairwise compar- ison of flights to calculate location, speeds and heading to determine the risk level of NMAC. Eight-hours of data were stored in a distributed file system to be loaded by JSON reader component. This ingestor calls NMAC detector which computes the alert level. Once an alerting level is calculated for each flight pair, the results are sent to the clustering estimator to be associated with a specific cluster, and these results are stored back in the file system. This use case requires a heavy workload nature, and therefore a performance QS for deadlines lower than one hour was defined. +Use case 2 (UC2) is a real-time application to detect NMAC within an air space range. The ingestor component consumed data through direct REST service. 
Flight data was pushed in a message queue to be consumed by the NMAC detector component which performed the potential collision detection to be finally stored in a relational DB through a message broker connector. It is worth mentioning that the NMAC estimators of UC1 and UC2 are the same, since their inputs, outputs, and behavior are identical, so we can reuse such functional component definition, though their deployments are different regarding the QS constraints. Given the near real-time nature of this application, latency is the critical QS.
+A. Architectural Drivers Elicitation
+The business goal is to group NMAC events to identify potential risky zones and times within specific air-spaces. A scheduled job to detect risky clusters is processed in batch every day. Fig. 3 details drivers expressed using the ACCORDANT’s DSL. The NMACDetector component is required to have a deadline lower than 1 hour in the QS UC1 QS1. Analyzing this QS, a sensitivity point (UC1 SP1) is identified to achieve the deadline metric by applying two tactics: introduce concurrency and increase available resources. These tactics will be materialized in the software architecture design.
+2http://github.com/kmilo-castellanos/accordant-usecases
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+92
+Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply.
+
+
+Fig. 3. Excerpt of Input Package Models of UC1 Using ACCORDANT DSLs
+
+Fig. 4. Excerpt of Functional Models of UC1 Using ACCORDANT DSL
+B. Data Transformations and Analytics Models
+Analytics models were trained and evaluated by the data scientist using Scikit-learn, exported to PMML, and loaded in the ACCORDANT FV model. In this case, the decision tree and K-means models will be assigned in the FV specification.
+C.
Design of Software Architecture
+FV models were designed using ACCORDANT Functional DSL to specify a component-connector structure for each use case, Fig. 4 depicts the UC1’s FV model. Since drivers are required in FV, this package is imported using the keyword use. The FV model specified four components (JsonReader, NMACDetector, NMACClustering, and HDFSWriter), and three procedure call connectors: CallNMACDetector, CallClustering, and CallWriter which connect the components through ports. Additionally, NMACDetector uses a batch processing model, and it has associated “NMACTreeModel.pmml” obtained in the previous step. The sensitivity point UC1 SP1 aligns the drivers to the NMACDetector as part of the introduce concurrency tactic realization. NMACDetector will be translated into a distributed processing component which must be supported by the target technology.
+DV models were designed using ACCORDANT DSL for UC1 defined in the FV, see Fig. 5. Given that DV is based

Fig. 5. Excerpt of Deployment Models of UC1 Using ACCORDANT DSL
+on the input package and FV model, they are imported using the keyword use. This view includes the artifacts that map connectors and components from FV to deployable elements in DV. For instance, NMACDetector (see markers A) is mapped to NMACArtifact, and deployed in SparkWEnv (see markers B). Devices and deployments were specified to support the computation requirements. For instance, deployments of Spark master and worker nodes (e.g. SparkWorkerDep) details replicas, pods and execution environments (ExecEnv). ExecEnv defines the docker image, resources, and ports along with the artifacts to be deployed. Finally, the sensitivity point UC1 SP1 associates the deployment SparkWorkerDep to performance QS, and the tactic increase available resources (see Section V-A) to support distributed computing over a Spark cluster.
+D.
Integration, Code Generation, and Execution +Once FV and DV models were designed and integrated, code generators produced functional code and IaC. The target technology selected was Apache Spark, so NMACDetector component implements the PMML model in a Spark driver program. The Spark program defines data input and output from the Data Dictionary and Mining Schema embedded in PMML specifications. On the other hand, the infrastructure code was generated as Kubernetes’ configuration files. Kuber- netes code was executed on the AWS cloud using Amazon Kubernetes and EC2 services. After that, the software code was installed over the cluster to operationalize the solution. +E. Solution Monitoring +Deadline and latency metrics for each use case were collected in operation and validated against QS defined in Section V-A. As a result, different deployment configurations were designed, deployed and monitored in each iteration to monitor the fulfillment of QS. +VI. PRELIMINARY RESULTS +Revisiting the related work reviewed in Section III, we have shown how the ACCORDANT Method fills some gaps + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +93 +Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply. + + +Fig. 6. Development and Deployment Time for Use Case +in BDA architecture. As presented in Fig. I, ACCORDANT follows the SoC principle using three different languages to specify domain concerns. Analytics models in ACCORDANT are cross-industry and technology-neutral. In terms of soft- ware architecture, ACCORDANT supports QS specifications aligned to FV and DV, and these models can be specified independently, but in an integrated way. Code generators promote flexibility and faster development and deployment. Respecting DevOps practice, deployment models allow us to design deployment diagrams and generate IaC to provision such resources semi-automatically. 
The solution monitoring is aligned to the initial QS specification and implemented by injecting logging code in the generated applications. Finally, self-adaptation is not covered in the current version. +Regarding the development and deployment effort, Fig. 6 depicts the average times invested for UC and two devel- opment teams. These teams developed the UCs using each framework and taking drivers (QS, constraints, and tactics) and the PMML model as input. Each UC was deployed to cloud containers, and the QS monitored using the features offered by each framework. The development time using AC- CORDANT was higher (between 22.7% and 44.4%) compared to SpringXD and Fastscore, but the deployment time was significantly lower (between 50% and 81.8%) using ACCOR- DANT. The higher development time can be explained by the time required to specify architectural inputs and FV models. Besides, the current ACCORDANT prototype generates func- tional code for estimators, but ingestor, sinks, and connectors still require manual coding. Although ACCORDANT required more effort in the development phase, this effort was rewarded during the deployment phase, where infrastructure and QS- monitoring are provided automatically aligned to QS, unlike other approaches. The biggest time differences arose from UC1 that demanded more time because it included a more complex pipeline, involving two estimators. These results sug- gest ACCORDANT is more suitable for application involving multiple iterations, or in subsequent applications where reusing architectural elements can reduce development times. +VII. CONCLUSIONS +We have presented a design method to specify, deploy, and monitor BDA solutions. Two avionics use cases were used to evaluate our approach against two BDA frameworks. As a result, ACCORDANT has shown to facilitate and accelerate iterative deployment by offering an integrated and high-level design BDA applications by investing more effort in the design phase. 
In contrast, some limitations have emerged from +experimentation. The development phase is slower than the other approaches for multiple reasons. The current version of the ACCORDANT’s prototype requires extra manual coding. ACCORDANT also requires more design details and archi- tectural inputs. These additional definitions are rewarded in consecutive iterations, so ACCORDANT is most suitable for application involving multiple iterations. Finally, our approach takes advantage of reusing architectural decisions and models, hence, first-time or one-time applications may not be benefited from our proposal. +The next steps include a model to predict the expected performance based on FV and DV models, target technologies, and collected metrics to recommend the optimal architecture configuration given a set of drivers. Furthermore, we are developing validation rules to check correctness properties against architectural constraints, e.g. technology conformance, resource availability, and architectural mismatch, taking advan- tage of the integration among drivers, FV and DV. Finally, the experimentation has been performed using containers in the DV, but we expect to include serverless and/or fog computing deployment which can open new challenges. +REFERENCES +[1] H.-M. Chen, R. Schutz,¨ R. Kazman, and F. Matthes, “How Lufthansa Capitalized on Big Data for Business Model Renovation,” MIS Quarterly Executive, vol. 1615, no. 14, pp. 299–320, 2017. +[2] C. Castellanos, D. Correal, and J.-D. Rodriguez, “Executing Architec- tural Models for Big Data Analytics,” in Software Architecture, C. E. Cuesta, D. Garlan, and J. Perez,´ Eds. Cham: Springer International Publishing, 2018, pp. 364–371. +[3] H. Cervantes and R. Kazman, Designing software architectures: a practical approach. Addison-Wesley Professional, 2016. +[4] P. Clements, R. Kazman, M. Klein et al., Evaluating software architec- tures. Tsinghua University Press Beijing, 2003. +[5] D. Lechevalier, R. Ak, Y. T. Lee, S. 
Hudak, and S. Foufou, “A Neural Network Meta-Model and its Application for Manufacturing,” in 2015 IEEE International Conference on Big Data, 2015, pp. 1428–1435. +[6] M. Gribaudo, M. Iacono, and M. Kiran, “A Performance Modeling Framework for Lambda Architecture Based Applications,” Future Gen- eration Computer Systems, jul 2017. +[7] Y. Huang, X. Lan, X. Chen, and W. Guo, “Towards Model Based Approach to Hadoop Deployment and Configuration,” in 12th WISA. IEEE, sep 2015, pp. 79–84. +[8] A. K. Sujeeth, H. Lee, K. J. Brown, H. Chafi, M. Wu, A. R. Atreya, +K. Olukotun, T. Rompf, and M. Odersky, “OptiML: An Implicitly Parallel Domain-Specific Language for Machine Learning,” in 28th ICML, 2011, pp. 609—-616. +[9] M. Alrifai, H. Eichelberger, C. Qui, R. Sizonenko, S. Burkhard, and +K. Chrysos, “Quality-aware Processing Pipeline Modeling,” QualiMaster Project, Tech. Rep., 2014. +[10] Open Data Group, “FastScore.” [Online]. Available: https://www.opendatagroup.com/fastscore +[11] S. Anandan, M. Bogoevici, G. Renfro, I. Gopinathan, and P. Peralta, “Spring XD: a modular distributed stream and batch processing system,” in Proceedings of the 9th ACM International Conference on Distributed Event-Based Systems - DEBS ’15. New York, New York, USA: ACM Press, 2015, pp. 217–225. +[12] M. Artac, T. Borovsak, E. Di Nitto, M. Guerriero, D. Perez-Palacin, and D. A. Tamburri, “Infrastructure-as-Code for Data-Intensive Ar- chitectures: A Model-Driven Development Approach,” in 2018 IEEE International Conference on Software Architecture (ICSA). IEEE, apr 2018, pp. 156–165. +[13] C. Castellanos, B. Perez,´ C. A. Varela, M. d. P. Villamil, and D. Correal, “A survey on big data analytics solutions deployment,” in Software Architecture, T. Bures, L. Duchien, and P. Inverardi, Eds. Cham: Springer International Publishing, 2019, pp. 195–210. +This document was truncated here because it was created in the Evaluation Mode. 
+This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +94 +Authorized licensed use limited to: Macquarie University. Downloaded on June 23,2020 at 18:40:24 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt b/docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt new file mode 100644 index 0000000..f032de8 Binary files /dev/null and b/docs_to_import/rsl_oliveira2024/2-The_Framework_of_Extracting_Unstructured_Usage_for_Big_Data_Platform.txt differ diff --git a/docs_to_import/rsl_oliveira2024/25-Problem-of-Developing-Fault-Tolerant-High-Loaded.txt b/docs_to_import/rsl_oliveira2024/25-Problem-of-Developing-Fault-Tolerant-High-Loaded.txt new file mode 100644 index 0000000..c695006 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/25-Problem-of-Developing-Fault-Tolerant-High-Loaded.txt @@ -0,0 +1,170 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +On the Problem of Developing a Fault-Tolerant High-Loaded Cluster of Support for an Intelligent Transportation System +Mikhail Gorodnichev, Marina Moseva +Mathematical Cybernetic and Information Technologies +Moscow Technical University of Communications and Informatics Moscow, Russia +m.g.gorodnichev@mtuci.ru; m.s.moseva@mtuci.ru + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply. 
+ +Abstract— The study considers methods and means of constructing architectures of big data processing systems for intelligent transportation systems. When developing a large intelligent transportation system (for example, within a large city, region or country), there are issues including redundancy and duplication of stored data. The purpose of this paper is to improve the performance of big data processing system for intelligent transportation system. The work gives an overview of the main approaches and tools for solving problems of development of systems for processing big data, in particular, we considered the conceptual apparatus in the field of ongoing research, analyzed the practical approaches to the distributed storage and processing of big data, and reviewed the theoretical basis of the functioning of data lakes. Also, the work carried out the development of a prototype software system for processing big data for intelligent transport system, in particular, the proposed methodology for building a decentralized ITS, describing the main implemented services, as well as testing the prototype software. +Keywords — big data, intelligent transportation system, fault- tolerant, high-loaded cluster, processing. +I. INTRODUCTION +Current use of the term "big data" tends to refer to the use of predictive analytics, user behavior analytics, or some other advanced data analytics techniques that extract value from big data, and rarely to the specific size of the data set [1]. There is no doubt that the amount of data now available is indeed large, but that is not the most important characteristic of this new data ecosystem. Data set analysis can find new correlations for "identifying trends in business, preventing disease, fighting crime, and so on." 
Researchers, business executives, practitioners, advertising and government representatives regularly face challenges with big data sets in areas such as Internet search, financial technology, health care analytics, geographic information systems, urban informatics, intelligent transportation systems, etc. [2] +Big data storage, processing, and exchange systems operate under two basic models: centralized (classical) and decentralized (distributed) [3]. Decentralized systems are more reliable and tamper-proof, however, they are more complex and require the presence of well-established mechanisms for +interaction of all system elements. The emergence and rapid development of decentralized systems based on blockchain technology [16, 17] has provoked an explosion of interest in research in this area, and we can assume that this trend will continue in the near future. +The analysis of existing means of decentralized data storage and exchange has shown that in the Russian segment of the Internet as well as in the foreign ones, there are solutions providing the user with data storage and exchange services using cloud technologies (for example, Yandex.Cloud, SberCloud, etc.). However, the vast majority of such solutions when implementing the data storage mechanism, user documents are stored entirely on remote servers, which may lead to data loss in case of incorrect operation of the decentralized system. +The research carried out in this paper considers methods and means of constructing architectures of big data processing systems for intelligent transportation systems. The development of a large intelligent transportation system (for example, within a large city, region or country) raises, including the issues of redundancy and duplication of stored data (including the framework of data lakes). 
It seems relevant to consider the Raft protocol as the basis for large data processing systems, which allows you to control the number of duplicate data blocks (files, documents, etc.) and notify developers in case of memory shortage (or, for example, problems with servers). As the analysis of Russian literary sources showed, the issues of using Raft protocol when creating decentralized systems for big data processing in the Russian scientific environment are poorly studied, which determines the novelty of this direction. +The multifaceted nature of the topic under study implies the use of regulatory and scientific resources in the field of organization of big data systems, Russian and foreign scientific literature on the general principles of intelligent transport systems, and other topics revealing the theoretical and practical significance of the subject area. The problems of development and research of conceptual foundations of principles of big data storage and processing are mainly devoted to the works of foreign scientists B. Inmon, C. Walker, T. John, P. Misra, P. Simon, I. Terrizzano, P. Schwarz, etc. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply. + +979-8-3503-4829-3/23/$31.00 ©2023 IEEE + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply. + +II. RELATED WORK +The problems of big data processing are combined in the academic discipline of Data Science [4]. Data science includes methods for processing data under conditions of large volumes and high level of parallelism, statistical methods, methods of intelligent analysis, etc. Data science as an academic discipline can be represented as Euler circles. 
+Big data is a field that deals with ways to analyze, systematically extract information, or otherwise work with data sets that are too large or complex for traditional application software to handle [5]. +The current use of the term "big data" tends to refer to the use of predictive analytics, user behavior analytics, or some other advanced data analytics techniques that extract value from big data, and rarely to the specific size of the data set. It's worth noting that the amount of data now available is really big, but that's not the most important characteristic of this new data ecosystem. By analyzing datasets, there is an opportunity to find new correlations for "identifying trends in business, preventing disease, fighting crime, etc." [6]. Researchers, business executives, practitioners, advertising and government representatives regularly face challenges when dealing with large data sets in areas such as Internet search, financial technology, health care analytics, geographic information systems, urban informatics, business informatics, etc. +The size and number of data sets available is growing rapidly as data is collected by devices such as mobile devices, Internet of Things information devices, antennas, logging tools, cameras, microphones, radio frequency identification (RFID) readers, and wireless sensor networks [7]. International Data Group Inc. (IDC) reports that global data volume has shown exponential growth from 4.4 zettabytes to 44 zettabytes between 2013 and 2020, and by 2025, data volume could be 163 zettabytes or higher. +Under the real-time mode is understood the mode of information processing, in which the interaction of the information processing system with the external processes in relation to it is provided at a rate commensurate with the rate of these processes. 
+Examples of the main applications of real-time systems are as follows: +1) onboard equipment of space systems; +2) measurement and control systems; +3) radar and navigation systems; +4) automatic process control systems in industry; +5) banking systems. +Real time systems are divided into hard real time system, HRTS and soft real time system, SRTS. +Hard real-time systems include on-board control systems, emergency protection systems, emergency event recorders, safety systems, monitoring and control systems, etc. Soft real- time systems include interactive systems, vending machines, data processing systems from weather stations, etc. The main difference between hard real time systems and soft real time systems can be expressed in the following: hard real time +system will never be late in reacting to an event, and soft real time system should not be late in reacting to an event. +In the field of big data there is also the concept of Datalake, the idea of which is to store data on the servers of a given "lake" in a raw format [8]. A distributed (decentralized system) is understood as a system in which all servers are the same, i.e., there are no "leaders" and "wards," and the main idea is to combine private servers into a common cluster, which serves as one big server. + +Fig. 1. Functional architecture of ITS + +Fig. 2. Physical architecture of the ITS +An intelligent transport system is a management system that integrates modern information and telematics technologies and is designed for automated search and adoption to implement the most effective management scenarios for the transport and road complex of the region, a particular vehicle or group of vehicles to ensure a given population mobility, maximize road network use indicators, improve safety and efficiency of transport. 
+The big data technologies underlying Data Science include [9-10]: +1) MapReduce is a distributed computing model used +when processing large data sets in computer clusters or on computers with multicore processors. +2) NoSQL - a number of approaches aimed at +implementing database stores that provide scalability, high availability and flexibility. +3) Hadoop is a set of utilities, libraries and frameworks +for developing and executing distributed programs running on computer clusters. +4) Hardware solutions - configured solutions for +processing large amounts of data. +These technologies implement the basic principles of working with large amounts of data: +a) horizontal scalability (the increase in data volume is +directly proportional to the increase in the number of processed computers forming the computing cluster); +b) fault tolerance (replication of information on several +computers of the computing cluster). +Technologies for processing large amounts of distributed data also lie in the field of scientific research, e.g. Defense Advanced Research Projects Agency - DARPA, Russian Direct Investment Fund, Scientific Research Steering Committee, China, etc. +Practical technologies for processing large amounts of data include, for example, HIVE database management system, Deep Exploration and Filtering of Text system, XDATA system for intelligent processing of large amounts of unstructured data, Big Mechanism system, etc. +For example, the XDATA system aims to solve practical problems by developing computational methods and software tools for processing and analyzing large, unstructured, and incomplete data [11]. During the development of XDATA, distributed database technologies, statistical processing methods, and information visualization. +III. CLUSTERING APPROACHES +The idea behind clustering is to combine two or more servers into one group of servers called a cluster [12]. +The architecture based on a single server is the easiest to understand and implement. 
As a rule, such architecture plays an important role in proving the relevance of a new concept and the workability of an idea [13]. Implementation of a single- server architecture requires a small amount of computational resources, and most of the time is spent on thinking about the idea itself. +The advantages of using a single-server architecture include: +a) easy implementation and quick deployment; +b) ease of maintenance throughout the entire life cycle; +c) relatively low cost. +The disadvantages of using a single-server architecture can include: +a) low resistance to heavy loads; +b) oversimplification of the system - if you need to +implement macroservices in cloud solutions, you need to completely adjust your deployment approach; +c) does not support multiple services simultaneously, +limitations are imposed by the number of cores in the servers; +d) because multiple services use the same processor, +one service can affect the performance of another. +An architecture based on several servers has the notion of multi-server. In the case of solving the problem of paralleling calculations in database management systems and others for multiprocessor platforms it is necessary to run several database servers, including those on different processors (and each of the servers should be multithreaded). This model is called multithreaded multiserver architecture and is related to paralleling the execution of a single user query by several server processes. + +Fig. 3. Variant of data module interaction structure within a distributed ITS +The Raft algorithm is considered in the scientific community as a fairly simple and proven approach used in building both decentralized repositories [14, 15]. +The advantages of the Raft approach include: +a) there are only three states for cluster servers: +Follower, Candidate, Leader; +b) simple functionality to implement; +c) a proven solution; +d) High resilience when servers fail. + +Evaluation Only. Created with Aspose.Words. 
Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply. + +The disadvantages of the Raft approach include: +a) an additional layer of data management; +b) it is quite difficult to detect defects in the system; +c) the results of individual scientific studies show a +slower performance than, for example, when using the TCP protocol together with TLS. Despite these disadvantages, the Raft algorithm offers a conceptual idea that ensures the reliability of the most decentralized system of big data processing. +IV. ARCHITECTURE OF A BIG DATA PROCESSING SYSTEM FOR AN ITS +The task of this paragraph is to elaborate in as much detail as possible the architectural issues of functioning of intelligent transport systems using big data technologies. The architectural options presented below are purely theoretical in nature, however, it seems appropriate to conduct this generalizing study for future developers of transportation systems. +The proposed version of the reference architecture of the Big Data Processing System for the Intelligent Transportation System (EASOBD-ITS) allows to identify ways of planning, developing and deploying applications in the subject area under consideration and to facilitate the implementation of big data analytics solutions for transportation organizations. EASOBDD-ITS contains a description of the system to be deployed, including the technology stack and integration protocols EASOBD-ITS includes the stages of data collection, storage, extraction, processing and use, agreed with the ITS domain services. + +Fig. 4. UML-diagram of storage classes +The reference architecture is described by representations, each reflecting the problems of a particular system. The representations facilitate summarization and discussion of architectural issues by stakeholders. 
Specifically, the representations included in the EASOBD-ITS are functional, process, and integration. +The functional representation describes services (sets of common functions), connectors (communication between services) and groups of services. The representation area (A) defines how services provide information through channels. +Zone (B) describes the server services that integrate the considered ITS reference architecture with Geographic Information Services (GIS). The Analytics Zone (C) describes the types of analyses that must be performed and maintained in the Data Storage Zone (D) and the Analytical Sustainability Assessment Zone (E). The Analytical Stability Assessment Zone offers services such as distributed file system, SQL, NoSQL storage, etc. The Consumption Zone (E) is responsible for collecting data from external sources and redirecting it to the appropriate consumers. + +Fig. 5. Algorithm of the service agent +The integration view is used to describe each type of connector at the transport, distribution, intermediary, and application layers. This connector describes the recommended protocols for linking external resources at the integration and transport layer. These include the protocols HTTP, FTP, WebSocket and MQTT. The protocols should be chosen based on the communication scheme (connection-oriented or subscription/publication-based) and their endpoints (web application, database, IoT devices, FTP server, etc.). +The technology selection view provides a list of recommendations for specific products that offer the capabilities needed for the service or group of services that need to be created. This view is a guide of sorts when selecting solutions in a particular implementation. +V. PRIVATE VERSION OF THE ARCHITECTURE +A special case describes an intelligent system for analyzing traffic accidents and their dependence on traffic fines. The use of a web application is recommended for the visibility of accident analysis and monitoring. 
The analysis module takes

+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply.
+
+data on road incidents and their relationship to traffic fines to support road safety decision-making. The work monitoring module monitors bus traffic in the public transportation system to apply mobility models. These modules are combined under a single interface, allowing users to receive information about accidents on the roads, bus routes, as well as their geolocation data and speed. Forecasting includes the calculation of a risk index for each road section and the correlation of accidents and traffic fines. As a result, ITS management is able to analyze the road situation and make decisions to ensure road safety.
+Incidents, traffic tickets and road networks are external data sources loaded via the AccidentsETL component, which is part of the EASOBD-ITS Zone E and G services and is implemented using Python, Pandas and PostGIS software tools. In terms of quantity and quality, the data includes: fines for violations, traffic incidents, GPS-tracking data of buses, and road networks and their graphs. As new data arrives, it is filtered and stored in the MongoDB database, which corresponds to Zone D. In addition, bus data is downloaded and merged via the OperETL component of the Spark software in zones E and D. The resulting merged data is also stored in the MongoDB database. In zone B, the AccidentBackend and OperBackend components access and aggregate pre-processed incident data using Python and then provide the results to the frontend component via REST. The AccidentDashboard and OperMonitoring frontend components are in turn implemented using the Angular Dashboard Framework (ADF) tool, AngularJS, C3, D3 and Leaflet in Service Area A.
+VI.
METHODOLOGY FOR BUILDING A DECENTRALIZED ITS +Decentralized ITS, managed via the Raft algorithm, is fully automated, with the addition of new servers performed by an agent, which can request, for example, a cloud provider for an additional server, and then connect it to the main cluster. Thus, using the agent service it is possible to connect new servers. +For efficient development and maintenance, a decentralized ITS, managed via Raft algorithm, has a microservice architecture. Microservice architecture is a variant of service- oriented software architecture, aimed at interaction as much as possible of small, weakly connected and easily changeable modules - microservices. +The storage service, which can run on a separate server as well as on any server with a database, provides two main tasks: +a) users receive targeted information about the state of +the transport network from the ITS; +b) saving unstructured data from various sources +(agents) of ITS (smartphones, multimedia devices of cars and public transport, smart traffic lights, video cameras, etc.) of different formats to ITS. +The storage service transfers unstructured blocks of data to the database service for storage. It is in communication with the auth, database, agent, and client services. + +Fig. 6. General scheme of service interaction +Service auth, is responsible for authentication and authorization in the decentralized ITS. This service allows new users to register in the system, authorize users by issuing tokens to the storage service, connect ITS agents (smartphones, multimedia devices of cars and public transport, smart traffic lights, video cameras, etc.). Located in interaction with storage services, client. + +Fig. 7. Block diagram of agent state transition +The database service is responsible for data storage in the decentralized ITS. To ensure reliable operation, it implements the Raft algorithm. It is in interaction with the storage service. 
+The agent service is responsible for adding new ITS servers for the database service. Allows you to request an additional server from the cloud provider and start a new database service on it. Notifies the storage service about adding a new server. Interfaced with database and storage services. 
+The client service includes two main modules: 
+a) a desktop application that enables end users to 
+retrieve targeted information about the state of the transport network from the ITS; 
+b) ITS agent libraries that provide storage for 
+unstructured data. 
+Located in interaction with auth, storage services. In order to implement a cluster using the Raft algorithm, it is necessary to implement a communication protocol in a decentralized ITS. The main feature is that each agent must work in both directions and at any time can be both in the follower state and in the leader state, already relative to its state the instructions of its functionality must change. 
+In the prototype software elements of a decentralized ITS, the architecture includes five microservices: client, storage, auth, database, agent. These microservices are sufficient to be located on a single server, but to improve the performance of the decentralized ITS, it is recommended to put each of the services on each server separately. 
+VII. CONCLUSION 
+In this paper, we investigated the problems of improving the efficiency of big data processing system for intelligent transportation system. 
+During the work the following partial tasks are solved: the conceptual apparatus in the field of ongoing research is formulated; the analysis of practical approaches to the distributed storage and processing of big data is carried out; the analysis of the basis for the functioning of data lake technology, the development of a reference architecture for large data processing system for intelligent transport systems has been implemented; the development of private versions of architectures to solve individual problems of intelligent transport systems has been implemented; the development of a method. 
+ACKNOWLEDGEMENTS 
+The reported study was funded by RFBR, project number 19-29-06036. 
+REFERENCES 
+[1] A. Amrani, K. Pasini, M. Khouadjia "Enhance Journey Planner with Predictive Travel Information for Smart City Routing Services". Forum 
+on Integrated and Sustainable Transportation Systems (FISTS). IEEE, 2020, pp. 304-308. 
+[2] N. Cao "Revisit Raft Consistency Protocol on Private Blockchain System in High Network Latency". International Conference on Artificial Intelligence and Security. Springer, Cham, 2021, pp. 571-579. 
+[3] T. John, P. Misra "Data Lake for Enterprises". Packt Publishing Ltd, 2017. 
+[4] G. Georgie, Donnelly "Future attacks". OREILLY, 2013, pp.76-94. 
+[5] M. Kastouni, A. Lahcen "Big data analytics in telecommunications: Governance, architecture and use cases". Journal of King Saud University-Computer and Information Sciences, 2020. 
+[6] T. Nakagawa, N. Hayashibara "Resource management for raft consensus protocol". International Journal of Space-Based and Situated Computing, 2018, Vol. 8, No. 2, pp. 80-87. 
+[7] H. Netto "Incorporating the Raft consensus protocol in containers managed by Kubernetes: An evaluation". International Journal of Parallel, Emergent and Distributed Systems, 2020, Vol. 35, No. 4, pp. 433-453. 
+[8] A. Olawoyin, C. Leung, A. Cuzzocrea "Open Data Lake to Support Machine Learning on Arctic Big Data". 
IEEE International Conference on Big Data (Big Data), IEEE, 2021, pp. 5215-5224. +[9] R. Singh "Highway 4.0: Digitalization of highways for vulnerable road safety development with intelligent IoT sensors and machine learning". Safety science, 2021, Vol. 143, pp. 105-116. +[10] N. Stojanović, D. Stojanović "Big Mobility Data Analytics for Traffic Monitoring and Control". Facta Universitatis. Series: Automatic Control and Robotics, 2020, Vol. 19, No. 2. pp. 087-102. +[11] C. Walker, H. Alrehamy "Personal data lake with data gravity pull". IEEE Fifth International Conference on Big Data and Cloud Computing, IEEE, 2015, pp. 160-167. +[12] E. Tourouta, M. Gorodnichev, K. Polyantseva, M. Moseva "Providing Fault Tolerance of Cluster Computing Systems Based on Fault-Tolerant Dynamic Computation Planning". Digitalization of Society, Economics and Management. Lecture Notes in Information Systems and Organisation, vol 53. Springer, Cham. DOI:10.1007/978-3-030-94252- 6_10 +[13] E. Kukharenko, I. Korkunov, M. Gorodnichev, T. Salutina "On the Introduction of Digital Economics in the Transport Industry". Systems of Signals Generating and Processing in the Field of on Board Communications, 2019, pp. 1-5. DOI: 10.1109/SOSG.2019.8706797. +[14] M. Moseva, M. Gorodnichev, K. Polyantseva, A. Sheremetev, K. Dzhabrailov "Development of a Platform for Road Infrastructure Digital Certification". Intelligent Technologies and Electronic Devices in Vehicle and Road Transport Complex (TIRVED), 2021, pp. 1-8. DOI: 10.1109/TIRVED53476.2021.9639102. +[15] M.S. Moseva "About methods for collecting and analyzing traffic flow characteristics," T-Comm, vol. 16, no.2, pp. 29-38, 2022. +[16] N.E. Konstantinov, M.G. Gorodnichev, R.A. Gematudinov "Blockchain as an IоT development platform," T-Comm, vol. 12, no.9, pр. 63-68, 2018. +[17] M.G. Gorodnichev, S.S. Makhrov, E.N. Denisova, I.D. 
Buldin "Application of blockchain technology to provide protection and control of wireless sensor network nodes," T-Comm, vol. 12, no.7, pр. 64-68, 2018. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on May 17,2024 at 14:44:04 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/27-Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development.txt b/docs_to_import/rsl_oliveira2024/27-Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development.txt new file mode 100644 index 0000000..c56d4cc --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/27-Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development.txt @@ -0,0 +1,194 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development +Daniel Staegemann https://orcid.org/0000-0001-9957-1003 +, Matthias Volk https://orcid.org/0000-0002-4835-919X +120 +Staegemann, D., Volk, M. and Turowski, K. +Adapting the (Big) Data Science Engineering Process to the Application of Test Driven Development. +DOI: 10.5220/0011289200003280 +In Proceedings of the 19th International Conference on Smart Business Technologies (ICSBT 2022) , pages 120-129 ISBN: 978-989-758-587-6; ISSN: 2184-772X +Copyright c 2022 by SCITEPRESS – Science and Technology Publications, Lda. All rights reserved + and Klaus Turowski +Magdeburg Research and Competence Cluster VLBA, Otto-von-Guericke University Magdeburg, Magdeburg, Germany +Keywords: Big Data, Data Science, Software Engineering, Big Data Engineering, Test Driven Development, TDD, +Process, BDSEP. 
+Abstract: Knowledge, information, and modern technologies have become some of the most influential drivers of +today’s society, consequently leading to a high popularity of the concepts of big data (BD). However, their actual harnessing is a demanding task that is accompanied by many barriers and challenges. To facilitate the realization of the corresponding projects, the (big) data science engineering process (BDSEP) has been devised to support researchers and practitioners in the planning and implementation of data intensive projects by outlining the relevant steps. However, the BDSEP is only geared towards a test last development approach. With recent works suggesting the application of test driven development (TDD) in the big data domain, it appears reasonable to also provide a corresponding TDD focused equivalent to the BDSEP. Therefore, in the publication at hand, using the BDSEP as a foundation, the test driven big data science engineering process (TDBDSEP) is proposed, facilitating the application of TDD in the big data domain and further enriching the discourse on BD quality assurance. +1 INTRODUCTION important, the focus of the publication at hand is on +the latter. Despite the popularity of BD, the Knowledge, information, and modern technologies corresponding quality assurance is not yet mature and have become some of the most influential drivers of new approaches, methods and tools are still being +actively explored. One example of this is the tCoodnasye’qs uensotlcyi,e tthye c(oLnecveipnt s oafn bdi g dMataam (BloDk ) a2n0d2 b1i)g. adaptation of the test driven development (TDD) +data analytics (BDA) are extremely relevant and approach to the BD domain (Staegemann et al. promising for many organizations across varying 2020b). This promises to bring several benefits, such domains and sizes. The potential applications and as an improvement to the developed systems’ quality, desired benefits are manyfold (Poleto et al. 
2017; van a subsequent increase of trust by the users, and also der Aalst and Damiani 2015). This includes, for more flexibility when it comes to the adaptation of the instance, customer relation management, marketing, applications to new requirements and changes to the managerial decision support, improvements to relevant environment. However, to our knowledge, +there is no guideline on how to structure the mgeanienrtaetniaonnc oe f aindde assu apnpdly i ncshiagihnt sm foanr atgheem eexnptl,o iotar titohne corresponding activities for the test driven +implementation of a BD project. Yet, in the form of ohfa rnneewss inmga riks eat sd eanmda npdriondgu ctatss.k Hthoawt eisv earc, ctohme paacntuieadl the (big) data science engineering process (BDSEP), +by many barriers and challenges. The main factors as proposed by Volk et al. (2020a), there is one for influencing the obtained results are the quality of the general BD endeavours. Therefore, it appears used data, the competence and willingness of the reasonable to adapt it to the application of TDD. For responsible users, and the quality of the application’s this reason, within this work, the following research implementation (Janssen et al. 2017; Staegemann et question (RQ) shall be answered: +al. 2019a). While all those aspects are highly + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +ICSBT 2022 - 19th International Conference on Smart Business Technologies +RQ: How can the (big) data science engineering characteristics, but also the questions that shall be process be adapted to the application of test driven answered through the use of BD, as well as the data’s development? content can change over time (Katal et al. 2013; To answer the RQ, the publication at hand is Staegemann et al. 2020a; Wu et al. 2014). structured as follows. 
After this introduction, the most Besides those four characteristics, there are, relevant terms and concepts are outlined in the however, further aspects that are relevant in the BD background section. Afterwards, the BDSEP is context. The quality of the used data is, for example, presented in a separate section to account for its extremely important and has huge impact on the significance in the course of this work. This is analysis results (Hazen et al. 2014). Moreover, followed by the development of the adapted process besides the data, BDA combines organizational, that supports the application of TDD. Finally, in the human, and further technical aspects (Alharthi et al. concluding remarks, the proposed artifact is further 2017). The latter is emphasized through a plethora of discussed, the presented work is recapitulated, and available tools and techniques (Turck and Obayomi avenues for future research are outlined. 2019), which renders it hard to make the right choice, +when it comes to the technology selection (Volk et al. +2021). Finally, due to the potentially high impact of 2 BACKGROUND the BDA applications on the success of the applying +organizations (Müller et al. 2018), and the resulting To facilitate a common understanding of the relevant need for trust and appreciation by the responsible terms and concepts, those are in the following briefly decision makers to assure correct use (Günther et al. outlined to establish a solid foundation for the 2017), comprehensive quality assurance is of utmost remainder of the publication at hand. importance for the corresponding endeavors (Gao et +al. 2016; Ji et al. 2020; Staegemann et al. 2021b). +2.1 Big Data 2.2 Big Data Engineering +Despite big data being one of today’s big trends +(Ghasemaghaei and Calic 2020; Volk et al. 2020b), As a consequence of the aforementioned big data and consequently also intense scientific discourse characteristics, the implementation of the (Staegemann et al. 
2019b), there is still no universally corresponding systems significantly differs from used definition for the term itself. In fact, not even the conventional IT projects, since there needs to be a origins of the term are completely clear (Diebold huge focus on the handling and interpretation of data. 2012). This often increases the development’s complexity. +However, the definition that is provided by the The term “big data engineering” (BDE) describes the National Institute of Standards and Technology entirety of the activities that are associated with the (NIST), is widely acknowledged, and therefore also creation of those BD systems (Volk et al. 2019). This relied upon for the publication at hand. It states that field that is in the intersection of big data, data big data “consists of extensive datasets primarily in science, and systems engineering includes numerous the characteristics of volume, velocity, variety, and/or tasks in several phases. In the beginning, there is the variability that require a scalable architecture for project planning with steps like the requirements efficient storage, manipulation, and analysis” engineering (Altarturi et al. 2017). This is followed (Chang and Grady 2019). by the actual design and implementation, including +Here, volume indicates the amount of data, aspects like the technology selection (Lehmann et al. regarding the number and/or size of files, that have to 2016). Finally, the solution’s deployment ensues. be processed by the corresponding applications Additionally, the aspect of quality assurance has to be (Russom 2011). Velocity refers to two aspects, the considered. +speed with which the data are incoming and the To facilitate the BDE process and support timeliness that is expected for the application’s results practitioners as well as researchers in the realization (Gandomi and Haider 2015). Variety addresses the of their BD endeavors, Volk et al. 
(2020a) have data’s heterogeneity, which is, inter alia, expressed developed the (big) data science engineering process through it being differently structured (structured, (BDSEP) that outlines the sequence of activities when semi-structured, unstructured), the use of varying creating such a BD application. +units of measurement and formats as well as different +contexts it originates from (Gani et al. 2016). Finally, +by variability it is expressed that the aforementioned +2.3 Test Driven Development 2.4 Microservices +As shown by the literature, the application of TDD is The idea behind the microservice concept is to a way of increasing a developed application’s quality partition the developed application into multiple (Staegemann et al. 2021a). This is mainly based on smaller services, which subsequently cooperate to two aspects. By the corresponding increase of the test solve the given task (Nadareishvili et al. 2016). coverage, the detection of errors is facilitated. Oftentimes, those services are constructed to provide Further, the design of the developed system is also a certain business functionality. This allows for a high influenced. The latter effect is caused by TDD heavily degree of specialization in the implementation. relying on the decomposition of the developed Each microservice runs in its own process. As a application into possibly small pieces. Due to the consequence of their independent nature, their correspondingly decreased complexity, it is easier to implementation can also be heterogeneous avoid errors and, additionally, the maintainability is (Freymann et al. 2020). Therefore, the responsible also increased (Crispin 2006; Shull et al. 2010). developers of each microservice can autonomously +While usually features are planned, implemented decide on the utilized technology stack and and then tested, this order is changed when applying programming languages. To enable the TDD. 
After the first step, which now also puts communication among the services, only lightweight emphasis on breaking down the envisioned solutions are used. Due to their properties, functionality into small, capsulated parts (Fucci et al. microservices can be separately deployed and used. 2017), the writing of the tests follows. To assure that To automate the former, it is common to use they indeed test new aspects, they are subsequently continuous deployment tools and pipelines. +run, with the expectation to fail, since the actual While, in software engineering, achieving a high implementation has not yet happened (Beck 2015). degree of modularity is not only considered desirable, Consequently, based on that premise, in case they but also challenging (Faitelson et al. 2018), the use of pass, they have to be reworked. Once the tests are set microservices facilitates this task, since it is achieved up, the real implementation happens, enabling the by design. Moreover, when changes are implemented, new functionality. Here, aspects like the elegance of it is often sufficient to only redeploy the respective the code or the adherence to conventions can be microservice instead of the entire system. As a result, ignored, as long as the tests pass (Crispin 2006). Only the effort for maintenance as well as for modifications afterwards the codes overall quality is improved is reduced. This, in turn, promotes an evolutionary through refactoring (Beck 2015). This is supported by design with frequent and controlled changes the previously written tests that help to detect if new (Krylovskiy et al. 2015). +errors were introduced during this procedure. As +stated previously, this overall process with its focus 2.5 Test Driven Development in Big +on incremental changes and small tasks (Williams et Data +al. 
2003) not only impacts the test coverage and +provides the developers with faster feedback, due to Since BD applications are highly complex and also shorter test cycles (Janzen and Saiedian 2005), but extremely quality sensitive, while TDD is capable of also heavily influences the developed solution’s improving a developed application’s quality, its design (Janzen and Saiedian 2008). application in the BD domain appears obvious. As the Usually, unit tests are the backbone of TDD. technical foundation for the concrete realisation, the However, those are supposed to be complemented by use of microservices has been proposed (Staegemann other types of tests such as integration or system tests et al. 2020b). This is based on the strong synergy that (Sangwan and Laplante 2006), with especially the exists between the concept of microservices and the former being seen as essential (Kum and Law 2006). breaking down of the desired applications into Moreover, it is common to use continuous integration possibly small parts as it is core of the TDD (CI) pipelines when applying TDD to enable test methodology (Shakir et al. 2021). By utilizing automation and, therefore, assure a high test microservices, each business functionality can be frequency without the need for the developers to designed as a separate service that can also be cumbersomely run the tests manually (Karlesky et al. independently scaled to correspond to the arising 2007; Shahin et al. 2017). In doing so, once a change workloads. This also allows to distribute the to the code is made, the existing tests are run by a CI development across different teams that can act server to check if any new errors have been mostly independent of each other and are further free introduced. to use the technologies and tools of their choice + +Figure 1: The (Big) Data Science Engineering Process (BDSEP) (Volk et al. 2020a). 
+instead of having to find an overarching consensus as considerations regarding the necessary data and a it would be needed for a monolithic solution. clear definition of the objectives. Subsequently, the +Since the created tests enable the developers to requirements engineering is performed, determining easily and immediately validate the functionality of the functional and non-functional requirements as any changes to the system, TDD also increases the well as possible constraints and the respective flexibility of BD applications, since it is easier to priorities. +implement changes to adapt to new needs and In the second phase, the architectural changes in the application environment. However, specifications are defined. This includes aspects such due to the inherent complexity, the application of as the system’s components with their in- and outputs, TDD in the BD domain is a challenging task with the the intended communication, and the available research on it being not yet very mature. To interfaces. Then, the system design is conducted. The somewhat reduce the complexity and support previously determined components are further researchers and practitioners in realizing their own specified, the most suitable technologies are chosen, endeavours, the use of a corresponding process model and the deployment plan is crafted. For those tasks, that helps to structure the necessary activities appears the harnessing of reference architectures (Ataei and to be sensible. Litchfield 2020), best practices (Pääkkönen and Pakkala 2015), and decision support systems (Volk et al. 2019) is explicitly highlighted as advisable. Once 3 THE (BIG) DATA SCIENCE the design is finished, the system’s construction can ENGINEERING PROCESS take place. Apart from its development, the applications running on it are programmed and the +(BDSEP) necessary algorithms are developed or integrated. 
The testing of the created solution constitutes the +To facilitate the introduction of BD applications and third phase of the process. Here, it is identified, what overcome the challenges of BDE, Volk et al. (2020a) should be tested, the corresponding test cases are have proposed the BDSEP. By combining knowledge constructed, subsequently run and the results are and practices from information systems engineering evaluated. This applies to each component as well as insights into data science processes, they individually as well as to the system as a whole. +crafted the BDSEP to support researchers and Once all the tests are passed, the delivery as the practitioners in the planning and implementation of fourth phase succeeds. For this distribution of the data intensive projects by outlining the relevant steps, solution to the target environment it is highlighted, needed for the corresponding endeavours. that, due to its complexity, a staged process should be On a high level, the BDSEP comprises four main chosen (Chen et al. 2015; Mobus and Kalton 2015) to phases, namely project planning, design and detect unforeseen issues. Therefore, this procedure development, testing, and delivery. While those as should also be comprehensively monitored +well as the steps described in the following, are Finally, those four main phases of the BDSEP are generally performed in the given order, it is always followed by the system’s actual operation, including possible to go back to previous activities if deemed the necessary maintenance and at the end of its necessary. lifetime also its decommissioning. While it is not +The first phase begins with the need to formulate strictly a part of the engineering and is, therefore, also a general idea or vision what shall be achieved by not seen as part of the main phases, it is evidently introducing a new system. 
This is followed by a more highly relevant with respect to the success of the in-depth analysis of the concrete use case, including developed application. +An overview of the process in its entirety is given To create a process that is geared towards the in Figure 1, which is heavily based on the original application of TDD, it is necessary to account for depiction in (Volk et al. 2020a). those levels, since having only one generic test While the BDSEP in its current form fits to the activity as in the BDSEP is no longer sufficient. +needs of many BD endeavours, it is clearly geared However, the initial considerations regarding a towards a test last development (TLD) approach, BD project remain the same, independently of the where the testing only follows the implementation. decision if a TLD or a TDD approach is chosen, since For the application of TDD, there is, to our the respective particularities only come into play once knowledge, currently no similar proposition. a rough concept for the desired product is devised. +However, while there are significant differences Therefore, the first phase of the BDSEP, the between TLD and TDD, major parts of the BDSEP project planning, can be carried over to the appear to be still applicable, which makes it TDBDSEP without the need for modifications. This reasonable to use it as a foundation for the means, that, again, at first the rough idea or vision for development of this work’s contribution, the test the project is formulated, based on the perceived driven big data science engineering process problem or need that caused its inception. This is (TDBDSEP). followed by a more in-depth analysis of the use case. Here it is clarified, which objective should be +fulfilled, and the corresponding specifics (e.g., time, 4 ADAPTING THE BDSEP TO location, or stakeholders) are discussed. 
Moreover, it TDD (TDBDSEP) is determined which data should be used for which purpose, where they come from, what their +characteristics are, and which implications come from To create the TDBDSEP, two pillars are built upon. this (e.g., if orchestration or harmonization of +Those are the BDSEP (Volk et al. 2020a), which is different data sources is necessary). Afterwards, the used as the foundation, as well as the concept and requirements engineering is performed, comprising terminology for using TDD in the BD domain functional and non-functional ones, including the (Staegemann et al. 2020b). One important aspect of corresponding prioritization, but also aspects such as the latter is the consideration of different levels when the incorporation of constraints and a feasibility regarding the developed solution. Besides the system analysis. +level, there are the component level, the sub- Following the project planning, an entirely new component or microservice level, and the method second phase is introduced, which deals with the level. The latter deals, according to its name, with the success definition. For this purpose, the criteria to separate methods and functions, that are implemented evaluate if the aspired goals of the implementation in the course of the project, without considering how have been achieved are determined. This entails, for their role in the bigger picture. In the microservice instance, which inputs should lead to which outputs, level, the services in their entirety are regarded. The but also the general system behavior as well as any services, in turn, are the building blocks of other aspects that are deemed relevant and can be components. Those are (virtual) units that are evaluated. In the subsequent activity, the contentually connected due to their functionality. corresponding test cases for the system as a whole are Examples for such components could be the import constructed. 
Those might be automated tests, but also of data when it is realized by multiple services that manually conducted ones. Since this activity is are specialized to get data from one specific (type of) primarily geared towards the actual implementation source or the utilized data’s pre-processing, if it in daily production and the intended users’ comprises various steps that are implemented as perspective, relevant business stakeholders, such as discrete microservices. However, there are no clear managers, domain experts, and targeted decision rules for the definition of the components. It depends makers should be heavily involved. +on the respective developers and their evaluation of The third phase is heavily leaning on the second the developed system. Furthermore, a microservice phase of the BDSEP, yet some adjustments come into can be part of multiple components, but always at play. Because the term component in the BDSEP has least belongs to one and each component consists of not exactly the same meaning as the term has in the one or many sub-components. Finally, on the system context of the above introduced terminology, it is level, the developed solution is regarded as a whole, replaced with the word “element”. Yet, the definition which could be seen as the equivalent of a monolithic of the components is also newly introduced. Further, implementation (Shakir et al. 2021). since one of the big advantages of microservice +architectures is the option to conduct the actual its concept, the first task is to prepare the evaluation development in a distributed fashion, once the of the parts that shall be developed next. This is done underlying architecture and design are known, design in two activities, one on the component level and, and development are detached from each other. For thereafter, one for the microservices. 
Once those are this reason, the design is a separate phase that set up, the actual implementation of the chosen contains two activities, namely the definition of service can take place. In contrast to the BDSEP, the architectural specifications and the system design. technology selection only happens now, allowing for Those are mostly identical to the corresponding more autonomy in the construction process. Further, activities from the BDSEP. Yet, the preparation of the the service is created in a test driven fashion, which implementation plan is explicitly introduced because makes the unit testing of its internal functions a key of the additional complexity due to the distributed aspect. Again, for all the described activities, it is nature. Further the technology selection no longer possible to go back to the previous one if it is deemed happens during the system design and is postponed sensible. After the construction is completed, the instead, because this decision is up to the developers execution of the prepared tests ensues. This of the respective microservices. This way, following comprises three activities. In the first one, the tests for the idea behind the microservice concept, each team the microservice are run. If they don’t pass, the can make the most sensible choice with respect to the process goes back to the construction activity. task, the members’ skills, preferences, or other factors Otherwise, there are two options. Either there are still that are considered relevant. As during the project more services to be constructed in the component, planning and success definition, it is again possible to then the corresponding tests for the next one are go back to the prior activity if an issue or an oversight written and it is subsequently constructed, or this was becomes apparent. the last service in the component, which leads to the The TDBDSEP’s fourth phase, development and next activity. 
There, the test cases that were created testing, constitutes the biggest deviation from the for the component level are run. If they fail, the next foundational BDSEP. Even though it is somewhat the step would be to go back to the test creation for the counterpart to the second aspect of its design and microservice that is identified as responsible, since development phase as well as the testing phase, the apparently some aspects have not been sufficiently TDD approach causes significant changes. Following reflected by the existing tests for it. In case of success, + +Figure 2: The Test Driven Big Data Science Engineering Process (TDBDSEP). +there are again two options. If there are more 5 CONCLUDING REMARKS components that need to be implemented, the tests for +the next one are written, which is followed by the With big data becoming more and more important subsequent steps. Should this have been the last regarding both, the prevalence of its application as missing piece for the system, the final evaluation can well as the importance within the utilizing take place as the third activity of the test execution. +There, the available tests for all the components and oacrgtiavnei.z Tathiiosn asp, pthliee sr,e floatre idn sstcainecnet,i ftioc tdhies ceoxuprlsoer aitsi ovne royf microservices are repeated. Further, also the tests that its practical use in different scenarios, organizational +were created in the success definition phase are aspects, and questions regarding the technical performed. Therefore, this activity gives the most realization. An important facet of the latter is the comprehensive assessment of the developed system facilitation of the corresponding quality assurance, and covers all aspects that have been deemed relevant since the quality of the provided solutions is highly by the developers. 
If there are any issues occurring, important when striving to maximize the benefits the process is continued from the test creation for the offered by the use of BD. One rather recent service that is identified as the cause, following the proposition in that regard is the application of TDD, same logic as in the previous step. +However, when the final testing procedure is bwahsielde otnh emrei criso sgeruvidicaensc, ei no tnh e tBheD rdeoamlizaainti.o Hn oowfe vBeDr, successfully concluded, the delivery as the fifth phase projects through the BDSEP, it is not suited for TDD +can follow. Similar to the project planning, it can be and, to our knowledge, there was also no other carried over from the BDSEP as it is, since it is not comparable process model that is. Yet, to reduce majorly affected by the TDD approach. Therefore, it +is, again, a closely monitored staged process (Chen et (rseismeailracrhleyr tso atnhde BpDraScEtitPio) ntheers c oinm preleaxliiztiyn,g a ntdh esiur popwornt al. 2015; Mobus and Kalton 2015). In case of +identified problems, the process should be traversed tceosrtr esdproivnedni ngB pDr oceensds emavooduerls ,t hatht eh elcprse attoi osnt ruocftu rae again from the system design activity, since errors the necessary activities appears to be desirable. To +during the implementation would have been likely +identified through the created tests, which hints bexripdlgoer edth ihs ogwa pt,h ien BthDeS pEuPb lcicaant iobne aatd ahpatnedd, itto wthaes towards an issue with the design. application of TDD. Thereby, the BDSEP was taken +Finally, the five main phases of the TDBDSEP are as a foundation that was then modified to reflect the followed by the system’s actual operation. This specificities of the TDD approach, resulting in the includes, besides the productive utilization, again, the +necessary maintenance as well as the TDBWDhSilEeP asso tmhies waosrpke’sc tcso ntrreimbuatiinoend. the same, decommissioning. 
However, this time, the former is compared to the BDSEP, the strong connection +facilitated by the strong modularization and the +availability of comprehensive tests, which makes it bchetawngeeens rtehgea rddiensgig tnh ea pnrdo cteesssti’n pgh aaslseos alnedd atcot ivmitaijeosr. easier to modify or replace elements without risking It now comprises five phases, namely project +the introduction of new issues. +An illustration of the TDBDSEP to facilitate the ptelsatninngin, ga,n sdu dcecleisvse rdye,f iwnhitiicohn ,a dree sfioglnlo, wdeevde bloyp tmhee natc atunadl comprehensibility of its structure and contents is operation. Even though the proposed process is +depicted in Figure 2. +Even though the described process is rather gheande troa blley mcoamdep sreohmeen sciovme,p froorm thisee ssa tkhea ot lfe calda rtioty c,e trhtearine comprehensive, some aspects have been simplified to +increase clarity and readability. While it is generally lbiemloitnagtiionngs .t oD essepvietera tlh (ev pirotussailb) ilcitoym opfo an emntisc roats eorvniccee, possible for a microservice to be assigned to multiple +components, as it was stated in the beginning of this tchoims pilsi cantoint gr efilte ctfeodr inth eth e redaedsecrr ipatinodn , tthoe raevfooride section, the prior descriptions assume that each hampering its application and dissemination. Yet, in +service is part of only one component. In situations situations where this option becomes relevant, it must where this is not the case, corresponding be accounted for by the TDBDSEP’s applicants. modifications to the process have to be factored in. 
Further, while it is generally possible and oftentimes The same applies to the fact that the process describes advisable to conduct the implementation of the a setting in which the development is conducted in a separate microservices in a parallelized fashion linear fashion, whereas in reality, a parallelization through multiple teams, for the TDBDSEP, this is during the development and testing phase is not only also simplified to a linear sequence of singular feasible, but possibly also advisable. activities, making it easier for the reader to follow. +With respect to future research, there are two Data and Security, Prague, Czech Republic. 07.05.2020 main avenues that should be pursued. The first one is - 09.05.2020, SCITEPRESS - Science and Technology to further explore and outline the details of the Publications, pp. 249-256 (doi: 10.5220/00093886024 90256). +dapespclircibaendts p whaitshe sa adnddit aiocntiavli tiinessi, gphrtosv iodni nhgo pwro tsop eschtaivpee Fucci, D., Erdogmus, H., Turhan, B., Oivo, M., and Juristo, +N. (2017). “A Dissection of the Test-Driven +their projects to obtain the best possible results. Development Process: Does It Really Matter to Test- Moreover, the TDBDSEP should be evaluated in and First or to Test-Last?” IEEE Transactions on Software possibly refined through the application in varying Engineering (43:7), pp. 597-614 (doi: settings and domains, amending the theoretical 10.1109/tse.2016.2616877). +considerations with ancillary inputs from practice. Gandomi, A., and Haider, M. (2015). “Beyond the hype: +Big data concepts, methods, and analytics,” +International Journal of Information Management REFERENCES (35:2), pp. 137-144 (doi: 10.1016/j.ijinfomgt.2014. +10.007). +Gani, A., Siddiqa, A., Shamshirband, S., and Hanum, F. Alharthi, A., Krotov, V., and Bowman, M. (2017). (2016). 
“A survey on indexing techniques for big data: “Addressing barriers to big data,” Business Horizons taxonomy and performance evaluation,” Knowledge (60:3), pp. 285-292 (doi: and Information Systems (46:2), pp. 241-284 (doi: +10.1016/j.bushor.2017.01.002). 10.1007/s10115-015-0830-y). +Altarturi, H. H., Ng, K.-Y., Ninggal, M. I. H., Nazri, A. S. Gao, J., Xie, C., and Tao, C. (2016). “Big Data Validation +A., and Ghani, A. A. A. (2017). “A requirement and Quality Assurance -- Issuses, Challenges, and +engineering model for big data software,” in Needs,” in Proceedings of the 2016 IEEE Symposium +Proceedings of the IEEE 2017 Conference on Big Data on Service-Oriented System Engineering (SOSE), +and Analytics (ICBDA), Kuching, Malaysia. Oxford, United Kingdom. 29.03.2016 - 02.04.2016, +16.11.2017 - 17.11.2017, pp. 111-117 (doi: IEEE, pp. 433-441 (doi: 10.1109/SOSE.2016.63). +10.1109/ICBDAA.2017.8284116). Ghasemaghaei, M., and Calic, G. (2020). “Assessing the Ataei, P., and Litchfield, A. (2020). “Big Data Reference impact of big data on firm innovation performance: Big +Architectures, a systematic literature review,” in data is not always better data,” Journal of Business +Australasian Conference on Information Systems Research (108:2), pp. 147-162 (doi: +(ACIS) 2020, Wellington, New Zealand, AIS. 10.1016/j.jbusres.2019.09.062). +Beck, K. (2015). Test-Driven Development: By Example, Günther, W. A., Rezazade Mehrizi, M. H., Huysman, M., +Boston: Addison-Wesley. and Feldberg, F. (2017). “Debating big data: A Chang, W. L., and Grady, N. (2019). “NIST Big Data literature review on realizing value from big data,” The +Interoperability Framework: Volume 1, Definitions,” Journal of Strategic Information Systems (26:3), pp. +Special Publication (NIST SP), Gaithersburg, MD: 191-209 (doi: 10.1016/j.jsis.2017.07.003). +National Institute of Standards and Technology. Hazen, B. T., Boone, C. A., Ezell, J. D., and Jones-Farmer, Chen, H.-M., Kazman, R., Haziyev, S., and Hrytsay, O. L. 
A. (2014). “Data quality for data science, predictive +(2015). “Big Data System Development: An Embedded analytics, and big data in supply chain management: An +Case Study with a Global Outsourcing Firm,” in First introduction to the problem and suggestions for +International Workshop on Big Data Software research and applications,” International Journal of +Engineering - BIGDSE 2015, IEEE, pp. 44-50 (doi: Production Economics (154), pp. 72-80 (doi: +10.1109/BIGDSE.2015.15). 10.1016/j.ijpe.2014.04.018). +Crispin, L. (2006). “Driving Software Quality: How Test- Janssen, M., van der Voort, H., and Wahyudi, A. (2017). +Driven Development Impacts Software Quality,” IEEE “Factors influencing big data decision-making quality,” +Software (23:6), pp. 70-71 (doi: 10.1109/MS.2006.157). Journal of Business Research (70:3), pp. 338-345 (doi: Diebold, F. X. (2012). “On the Origin(s) and Development 10.1016/j.jbusres.2016.08.007). +of the Term 'Big Data',” SSRN Electronic Journal (doi: Janzen, D., and Saiedian, H. (2005). “Test-driven +10.2139/ssrn.2152421). development concepts, taxonomy, and future direction,” Faitelson, D., Heinrich, R., and Tyszberowicz, S. (2018). Computer (38:9), pp. 43-50 (doi: 10.1109/MC.2005. +“Functional Decomposition for Software Architecture 314). +Evolution,” in Model-Driven Engineering and Software Janzen, D., and Saiedian, H. (2008). “Does Test-Driven Development, L. F. Pires, S. Hammoudi and B. Selic Development Really Improve Software Design (eds.), Cham: Springer International Publishing, pp. Quality?” IEEE Software (25:2), pp. 77-84 (doi: 377-400 (doi: 10.1007/978-3-319-94764-8_16). 10.1109/MS.2008.34). +Freymann, A., Maier, F., Schaefer, K., and Böhnel, T. Ji, S., Li, Q., Cao, W., Zhang, P., and Muccini, H. (2020). +(2020). 
“Tackling the Six Fundamental Challenges of “Quality Assurance Technologies of Big Data Big Data in Research Projects by Utilizing a Scalable Applications: A Systematic Literature Review,” and Modular Architecture,” in Proceedings of the 5th Applied Sciences (10:22), p. 8052 (doi: International Conference on Internet of Things, Big 10.3390/app10228052). +Karlesky, M., Williams, G., Bereza, W., and Fletcher, M. Development in Large Projects,” IT Professional (8:5), +(2007). “Mocking the Embedded World: Test-Driven pp. 25-29 (doi: 10.1109/MITP.2006.122). +Development, Continuous Integration, and Design Shahin, M., Ali Babar, M., and Zhu, L. (2017). “Continuous +Patterns,” in Embedded Systems Conference, San Jose, Integration, Delivery and Deployment: A Systematic +California, USA. 01.04.2007 - 05.04.2007, UBM Review on Approaches, Tools, Challenges and +Electronics. Practices,” IEEE Access (5), pp. 3909-3943 (doi: Katal, A., Wazid, M., and Goudar, R. H. (2013). “Big data: 10.1109/ACCESS.2017.2685629). +Issues, challenges, tools and Good practices,” in Sixth Shakir, A., Staegemann, D., Volk, M., Jamous, N., and +International Conference on Contemporary Computing, Turowski, K. (2021). “Towards a Concept for Building +Parashar (ed.), Noida, India. 08.08.2013 - 10.08.2013, a Big Data Architecture with Microservices,” in +IEEE, pp. 404-409 (doi: 10.1109/IC3.2013.6612229). Proceedings of the 24th International Conference on Krylovskiy, A., Jahn, M., and Patti, E. (2015). “Designing Business Information Systems, Hannover, +a Smart City Internet of Things Platform with Germany/virtual. 14.06.2021 - 17.06.2021, pp. 83-94 +Microservice Architecture,” in 2015 3rd International (doi: 10.52825/bis.v1i.67). +Conference on Future Internet of Things and Cloud Shull, F., Melnik, G., Turhan, B., Layman, L., Diep, M., (FiCloud 2015), I. Awan (ed.), Rome, Italy. 24.08.2015 and Erdogmus, H. (2010). “What Do We Know about +- 26.08.2015, Piscataway, NJ: IEEE, pp. 
25-30 (doi: Test-Driven Development?” IEEE Software (27:6), pp. 10.1109/FiCloud.2015.55). 16-19 (doi: 10.1109/MS.2010.152). +Kum, W., and Law, A. (2006). “Learning Effective Test Staegemann, D., Volk, M., Daase, C., and Turowski, K. +Driven Development - Software Development Projects (2020a). “Discussing Relations Between Dynamic +in an Energy Company,” in Proceedings of the First Business Environments and Big Data Analytics,” +International Conference on Software and Data Complex Systems Informatics and Modeling Quarterly +Technologies, Setúbal, Portugal. 11.09.2006 - (23), pp. 58-82 (doi: 10.7250/csimq.2020-23.05). +14.09.2006, SciTePress - Science and and Technology Staegemann, D., Volk, M., Jamous, N., and Turowski, K. +Publications, pp. 159-164 (doi: 10.5220/00013161015 (2019a). “Understanding Issues in Big Data +90164). Applications - A Multidimensional Endeavor,” in Lehmann, D., Fekete, D., and Vossen, G. (2016). Proceedings of the Twenty-fifth Americas Conference +“Technology selection for big data and analytical on Information Systems, Cancun, Mexico. 15.08.2019 - +applications,” Working Papers, ERCIS - European 17.08.2019. +Research Center for Information Systems 27, Münster. Staegemann, D., Volk, M., Jamous, N., and Turowski, K. Levin, I., and Mamlok, D. (2021). “Culture and Society in (2020b). “Exploring the Applicability of Test Driven +the Digital Age,” Information (12:2), p. 68 (doi: Development in the Big Data Domain,” in Proceedings +10.3390/info12020068). of the ACIS 2020, Wellington, New Zealand. Mobus, G. E., and Kalton, M. C. (2015). Principles of 01.12.2020 - 04.12.2020. +Systems Science, New York, NY: Springer. Staegemann, D., Volk, M., Lautenschlager, E., Pohl, M., Müller, O., Fay, M., and Vom Brocke, J. (2018). “The Abdallah, M., and Turowski, K. (2021a). 
“Applying +Effect of Big Data and Analytics on Firm Performance: Test Driven Development in the Big Data Domain – +An Econometric Analysis Considering Industry Lessons From the Literature,” in 2021 International +Characteristics,” Journal of management information Conference on Information Technology (ICIT), Amman, +systems (35:2), pp. 488-509 (doi: 10.1080/07421222. Jordan. 14.07.2021 - 15.07.2021, IEEE, pp. 511-516 +2018.1451955). (doi: 10.1109/ICIT52682.2021.9491728). Nadareishvili, I., Mitra, R., McLarty, M., and Amundsen, Staegemann, D., Volk, M., Nahhas, A., Abdallah, M., and +M. (2016). Microservice architecture: Aligning Turowski, K. (2019b). “Exploring the Specificities and principles, practices, and culture, Beijing, Boston, Challenges of Testing Big Data Systems,” in Farnham, Sebastopol, Tokyo: O´Reilly. Proceedings of the 15th International Conference on +Pääkkönen, P., and Pakkala, D. (2015). “Reference Signal Image Technology & Internet based Systems, +Architecture and Classification of Technologies, Sorrento. +Products and Services for Big Data Systems,” Big Data Staegemann, D., Volk, M., and Turowski, K. (2021b). +Research (2:4), pp. 166-186 (doi: 10.1016/j.bdr.2015. “Quality Assurance in Big Data Engineering - A +01.001). Metareview,” Complex Systems Informatics and Poleto, T., Heuer de Carvalho, V. D., and Costa, A. P. C. S. Modeling Quarterly (28), pp. 1-14 (doi: +(2017). “The Full Knowledge of Big Data in the 10.7250/csimq.2021-28.01). +Integration of Inter-Organizational Information,” Turck, M., and Obayomi, D. (2019). “The Big Data International Journal of Decision Support System Landscape,” available at http://dfkoz.com/big-data- Technology (9:1), pp. 16-31 (doi: 10.4018/IJDSST.20 landscape/, accessed on Jan 13 2020. +17010102). van der Aalst, W., and Damiani, E. (2015). “Processes Meet Russom, P. (2011). 
“Big Data Analytics: TDWI Best Big Data: Connecting Data Science with Process +Practices Report Fourth Quarter 2011,” Science,” IEEE Transactions on Services Computing Sangwan, R. S., and Laplante, P. A. (2006). “Test-Driven (8:6), pp. 810-819 (doi: 10.1109/TSC.2015.2493732). +Volk, M., Staegemann, D., Bischoff, D., and Turowski, K. +(2021). “Applying Multi-Criteria Decision-Making for the Selection of Big Data Technologies,” in Proceedings of the Twenty-seventh Americas Conference on Information Systems, Montreal, Canada/Virtual. 09.08.2021 - 13.08.2021. +Volk, M., Staegemann, D., Bosse, S., Häusler, R., and Turowski, K. (2020a). “Approaching the (Big) Data Science Engineering Process,” in Proceedings of the 5th International Conference on Internet of Things, Big Data and Security, Prague, Czech Republic. 07.05.2020 +- 09.05.2020, SCITEPRESS - Science and Technology Publications, pp. 428-435 (doi: 10.5220/000956980 4280435). +Volk, M., Staegemann, D., Pohl, M., and Turowski, K. +(2019). “Challenging Big Data Engineering: Positioning of Current and Future Development,” in Proceedings of the IoTBDS 2019, SCITEPRESS - Science and Technology Publications, pp. 351-358 (doi: 10.5220/0007748803510358). +Volk, M., Staegemann, D., and Turowski, K. (2020b). “Big +Data,” in Handbuch Digitale Wirtschaft, T. Kollmann +(ed.), Wiesbaden: Springer Fachmedien Wiesbaden, pp. +1-18 (doi: 10.1007/978-3-658-17345-6_71-1). +Williams, L., Maximilien, E. M., and Vouk, M. (2003). +“Test-driven development as a defect-reduction +practice,” in Proceedings of the 14th ISSRE, Denver, +Colorado, USA. 17.11.2003 - 20.11.2003, IEEE, pp. +34-45 (doi: 10.1109/ISSRE.2003.1251029). +Wu, X., Zhu, X., Wu, G.-Q., and Ding, W. (2014). “Data +mining with big data,” IEEE Transactions on Knowledge and Data Engineering (26:1), pp. 97-107 (doi: 10.1109/TKDE.2013.109). +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+131 diff --git a/docs_to_import/rsl_oliveira2024/3 - Big_Data_Testing_Framework_for_Recommendation_Systems_in_e-Science_and_e-Commerce_Domains.txt b/docs_to_import/rsl_oliveira2024/3 - Big_Data_Testing_Framework_for_Recommendation_Systems_in_e-Science_and_e-Commerce_Domains.txt new file mode 100644 index 0000000..14c5ccb Binary files /dev/null and b/docs_to_import/rsl_oliveira2024/3 - Big_Data_Testing_Framework_for_Recommendation_Systems_in_e-Science_and_e-Commerce_Domains.txt differ diff --git a/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt b/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt new file mode 100644 index 0000000..e30d3a4 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/37-A Process Model for Test Driven Development in the Big Data.txt @@ -0,0 +1,197 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +A Process Model for Test Driven Development in the Big Data Domain +Daniel Staegemann https://orcid.org/0000-0001-9957-1003 +, Matthias Volk https://orcid.org/0000-0002-4835-919X +109 +Staegemann, D., Volk, M., Jamous, N. and Turowski, K. +A Process Model for Test Driven Development in the Big Data Domain. +DOI: 10.5220/0011337200003335 +In Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - Volume 3: KMIS , pages 109-118 ISBN: 978-989-758-614-9; ISSN: 2184-3228 +Copyright c 2022 by SCITEPRESS – Science and Technology Publications, Lda. All rights reserved +, Naoum Jamous and Klaus Turowski +Magdeburg Research and Competence Cluster VLBA, Otto-von-Guericke University Magdeburg, Magdeburg, Germany +Keywords: Big Data, Test Driven Development, TDD, Process Model, Design Science Research, DSR, Microservice. 
Abstract: Big data has emerged to be one of the driving factors of today’s society. However, the quality assurance of +the corresponding applications is still far from being mature. Therefore, further work in this field is needed. This includes the improvement of existing approaches and strategies as well as the exploration of new ones. One rather recent proposition was the application of test driven development to the implementation of big data systems. Since their quality is of critical importance to achieve good results and the application of test driven development has been found to increase the developed product’s quality, this suggestion appears promising. However, there is a need for a structured approach to outline how the corresponding endeavors should be realized. Therefore, the publication at hand applies the design science research methodology to bridge this gap by proposing a process model for test driven development in the big data domain. +1 INTRODUCTION rather recent proposition was the application of test +driven development (TDD) to the implementation of Today’s society has developed to be heavily driven by BD systems (Staegemann et al. 2020). +When done correctly, this could solve several kMnaomwlloekd ge2, 0i2n1fo).r maCtoionns eaqnude nttelych, nobliogg y d(aLtae vin(B aDn)d, issues at once. Not only would the quality and +respectively big data analytics (BDA) have gained flexibility of the developed applications be increased, huge popularity among organizations that want to but possibly also the trust of the users, which is crucial profit from this rather new resource. Furthermore, to assure the frequent and genuine incorporation into those who do incorporate BDA into their processes the decision processes (Günther et al. 2017). However, experience (on average) a significant increase in so far, there has been no structured approach productivity (Müller et al. 
2018), further justifying the formulated how the corresponding endeavors should positive sentiment. Yet, this only does apply to proper be realized. To bridge this gap, the following research use, which is, however, not always a given, since it is question (RQ) shall be answered: +a highly challenging endeavor (Volk et al. 2019). The +arguably most common issues in this regard are a low RQ: How can the process of applying test driven input data quality (Abdallah et al. 2022; Staegemann development in the big data domain be structured? +et al. 2021b), human error or bias in the use of the +applications, and erroneous implementations of the To answer the RQ, the publication at hand is respective systems (Staegemann et al. 2019). structured as follows. After the introduction, the +For the publication at hand, the focus is on the background is briefly delineated. This is followed by latter. While there have been numerous works to an overview of the applied methodology. Afterwards, facilitate the testing of BD applications, it is still a in the main part, a process model for TDD in the BD rather immature topic (Staegemann et al. 2021c). domain is developed, which is also this work’s main Therefore, further work in this field is needed. This contribution. Subsequently, the model is further includes the refinement of existing approaches and discussed and avenues for future research are outlined. strategies as well as the exploration of new ones. One Finally, a conclusion is given. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +A Process Model for Test Driven Development in the Big Data Domain +2 BACKGROUND heterogeneous (Freymann et al. 2020). This, inter +alia, refers to the utilized programming languages and To establish a solid foundation and a common technology stacks. Moreover, their properties allow understanding for the further explanations, in the an independent deployment and usage. 
For this following, the most important terms and concepts are purpose, usually continuous deployment tools and briefly introduced. pipelines are used, allowing for the automation of the +procedure. +2.1 Big Data Even though in software engineering componentization is generally considered a good The amount of data that is being produced, captured, practice, achieving a high degree of modularity is and analyzed as a result of today’s society’s often seen as challenging task (Faitelson et al. 2018). digitization has been and is still rapidly growing However, when using microservices, this is achieved (Dobre and Xhafa 2014; Statista 2021; Yin and by design. This also reduces the effort for maintenance and the implementation of modifications, since it is +Kdeamynaankd s2 01f5o)r. Ciotns curprreonctleys,s iintsg comalpsole xitiyn carnedas tehde. often sufficient to only redeploy the affected service Consequently, the systems that were previously used when incorporating changes. As a result, through the +for this purpose are oftentimes no longer sufficient use of microservices, an evolutionary design, which is (Chang and Grady 2019). Therefore, new tools and driven by frequent and controlled changes, is techniques are needed to deal with the new promoted (Krylovskiy et al. 2015). +requirements and simultaneously the term big data +emerged to describe this phenomenon. Even though 2.3 Test Driven Development +the origins of a term are not conclusively clarified +(Diebold 2012) and there is also no unified definition TDD is generally seen as a development approach for it (Al-Mekhlal and Khwaja 2019; Volk et al. that (for the cost of a reduced speed) is feasible to 2020b), most of the relevant literature follows a improve an implementation’s quality (Staegemann et similar understanding. The arguably most influential al. 2021a). The corresponding advantages are description (Chang and Grady 2019) is based on four twofold. 
On the one hand, the test coverage is characteristics, which are sometimes also termed the increased. This helps to detect errors (early) and 4 Vs of big data. Those are volume (number and/or prevents that they affect the productive users. On the size of data entries), velocity (speed of data ingestion other hand, the system’s design is also influenced, and/or required processing speed), variety (diversity since a major part of TDD is its decomposition into of data and content), and variability (changes in the the smallest reasonable pieces. This reduced other characteristics over time). Due to the complexity also helps to avoid errors and increases widespread need for high quality decision making, maintainability (Crispin 2006; Shull et al. 2010). BDA is used in numerous domains, such as Even though the primary application area of TDD, manufacturing (Nagorny et al. 2017), management and also the one that is relevant for the remainder of support (Staegemann et al. 2022a), fashion (Silva et this paper, is in software development, it is also used al. 2019), education (Häusler et al. 2020), sports in other contexts, such as process modelling (Slaats et (Goes et al. 2020), agriculture (Bronson and Knezevic al. 2018) or ontology development (Davies et al. 2016), or healthcare (Bahri et al. 2019). 2019; Keet and Ławrynowicz 2016). +In the traditional software development approach, +2.2 Microservices new features are at first envisioned, then implemented and finally tested. However, in TDD, this order is changed. While the first step remains the same, the +Tdehceo mgepnoesrea l aind eean ovfi sitohne emd icarpopsleicrvatiicoen coinntcoe pste vise rtaol identified functionality is broken down into small smaller services that then interact with each other to parts (Fucci et al. 2017). In the following, tests for +accomplish the given task (Nadareishvili et al. 2016). those parts are written. 
To assure that they indeed test new aspects, they are run and should, for a lack of the +Ufusnucatliloyn,a litthye. Thsiesr,v iinc etus rn,a arell owbsa siet dto boenn efibtu fsrionmes as actual implementation, fail (Beck 2015). If they high degree of specialization. The microservices all don’t, they need to be reworked due to the premise. +After the tests failed, the productive coding takes raumno inng t heeaicr ho wotnh eprr,o ocenslyse lsi gahntdw feoirg thhte m coemchmanuinsimcast iaorne place, resulting in the desired functionality. The main +utilized. Due to their independent nature, the focus here is just to make it work. In turn, other particular services implementation can be aspects, like the elegance of the code, are not +important, as long as the previously written tests are homogenous toolset, but can instead rely on the passed (Crispin 2006). If this is the case, the code is technology set they deem the most suitable for the then refactored to improve the readability, its given task, due to the independence of the services adherence to standards, best practices, and from each other. In another context, TDD also conventions and to improve its overall quality (Beck increases the flexibility. The created tests allow for 2015). While doing so, the previously written tests are easier and safer changes to the developed application utilized as a safety net to make sure that no errors are because they can be immediately validated through introduced during this procedure. As mentioned the existing tests, leading to faster feedback, the earlier, this focus on incremental modifications and avoidance of newly introduced errors and small tasks (Williams et al. 2003) does not only affect consequently more trust by the users. However, even the coverage, but also the design of the developed though the general idea of applying TDD in the BD solution. 
Moreover, developers are provided with domain seems promising and there are already some more immediate feedback, due to the shorter test works in the domain (Staegemann et al. 2022b), to cycles (Janzen and Saiedian 2005). While unit tests facilitate its diffusion and make its application more are usually the backbone of TDD, they can (and accessible, it is still necessary to develop further should) also be amended by other types of tests, such corresponding patterns, frameworks, process models, as system, tests, or integration tests (Sangwan and best practices, and approaches to provide developers Laplante 2006). Hereby, especially the latter can be with a solid foundation they can lean on for their seen as essential (Kum and Law 2006). Furthermore, projects, instead of having to determine all steps (and to make sure the necessary test frequency can be their order) on their own. +achieved without the developers having to +cumbersomely deal with it manually, TDD is often +combined with a continuous integration (CI) pipeline 3 METHODOLOGY +to enable test automation (Karlesky et al. 2007; +Shahin et al. 2017). Consequently, whenever a In order to assure scientific rigor while answering the change is committed, a CI server runs the existing RQ, the design science research (DSR) approach tests, checking if the last change has introduced any (Hevner et al. 2004) is applied. This constructive new errors that need to be fixed. methodology is geared towards the development and +2.4 Test Driven Development in Big evaluation of artifacts in the information systems research domain. The purpose of those is to solve +Data organizational problems. They can be “constructs (vocabulary and symbols), models (abstractions and +As it was already described earlier, applying TDD is representations), methods (algorithms and practices), a promising new approach for the engineering of and instantiations (implemented and prototype high-quality BD applications. 
For this purpose, the systems)” (Hevner et al. 2004). To further enhance use of microservices as a technical foundation has the comprehensibility, the workflow of the design been proposed (Staegemann et al. 2020). Since a science research methodology (DSRM) presented in major component of TDD is to break down the (Peffers et al. 2007) is followed. The DSRM desired application into small parts and microservices decomposes the DSR into a sequence of six steps, facilitate exactly this architectural concept, there is a which are depicted in Figure 1. +huge synergy that can be exploited (Shakir et al. The DSRM begins with the problem 2021). Their use allows to realize each business identification and motivation, which are outlined in functionality as a separate service, which also gives the beginning of the next section. In the second the option for independent scaling, depending on the activity, the researcher shall define the objectives for respective workloads. Further, this also impacts the a solution. This will also be part of the same implementation process, since the development of the subsection. The third step, design and development, respective services can be distributed across different will be discussed in the succeeding subsection, teams. Additionally, those don’t have to use a resulting in the construction of the DSR artifact as the + +Figure 1: Process Sequence of the DSRM According to (Peffers et al. 2007). +main contribution of the publication at hand. facilitate the use of TDD in the BD domain to increase Furthermore, the underlying explanations will serve the overall quality of the developed solutions. as an implicit, preliminary evaluation, which Furthermore, this process should be easy and corresponds to activity five. The final activity, unambiguous to follow, which on the one hand refers communication, is performed through the publication to the outlined sequence of steps, but on the other hand at hand. 
However, due to the artifact being a process also on the utilized notation. +model, whose phases need to be filled with concrete +activities (which is out of this work’s scope) for its 4.2 Development of the Artifact +actual implementation, the demonstration will be +deferred to the future. Since this work builds upon the MBTDD-BD +proposition (Staegemann et al. 2020), it will also +follow the general structure, which results in the 4 THE PROCESS MODEL existence of several levels (system, component, +subcomponent/ microservice, method). Furthermore, In the following, using the DSRM by Peffers et al. the wording is adopted, increasing the (2007), a process model is proposed, facilitating the comprehensibility. Moreover, even though in the application of TDD in the BD domain through the following only tests are explicitly mentioned, as provisioning of a structured approach that supports suggested in the MBTDD-BD, benchmarks can also developers in implementing their respective BD be added alongside them to introduce another endeavors in a test driven manner. dimension of quality assurance. However, the main +focus is on the functional testing. +4.1 Motivation To start the process, it is at first necessary to know the requirements for the system that shall be +When applying the DSRM, the first activity is to developed (ISO 2018; Sommerville 2007). However, identify the problem that shall be solved, and to in the context of this work, outlining their gathering motivate, why this should be done. In the case at hand, would be out of scope. Therefore, the list of it was already outlined why big data is of great requirements is considered as an available input. significance for today’s society. Further, the Based on those, concrete features of the system can be derived. 
While it is not yet determined how they will +importance of proper quality assurance was outlined and it was discussed how the application of TDD be implemented, this step turns the identified needs might help in the implementation of the corresponding into high level tasks and is therefore a prerequisite for +the actual realization. In the TDD methodology, after systems. However, to our knowledge, an actual procedure for this has not yet been formalized. While determining what is to be implemented, the +it is necessary to maintain a certain degree of freedom corresponding tests shall be written. Accordingly, the to reflect the individual nature of such projects, this next step is to define the tests for the system as a also constitutes both, a barrier for entry, as well as a whole. Those might be automated, manual, or a hybrid potential source for errors and inefficiencies. Since the approach and are supposed to show if it provides the desired functionality. Implementing the system tests at +proposed concept for the application of microservice-based TDD in the big data domain (MBTDD-BD) such an early stage on the one hand corresponds with the TDD philosophy, and on the other hand potentially +contains several levels and types of tests, there is a big number of activities required for its implementation. also brings practical advantages. This step, as the Developers that don’t have extensive experience with previous one, immensely benefits from having domain +knowledge and a comprehensive overview of the TDD in the BD domain might be deterred by the huge number of different possible orders of those (with product’s business side, respectively the purpose it is +developed for. Therefore, the process should heavily wrong) decisions leading to extra work or worse results, as well as the threat of overlooking important involve experts or potential users from that domain. 
+activities, which would reduce the effectiveness of the Meanwhile the further steps are of rather technical nature and do not need that much comprehensive +approach. Since TDD is usually more time consuming than the traditional approach (Staegemann et al. knowledge of all usage related aspects of the product. 2021a), this additional effort can only be justified if By creating the system tests early, it is possible to +focus the involvement of the needed knowledge the corresponding benefits can actually be reaped. Therefore, it is necessary to provide developers with a carriers on the starting phase, which allows them to +structured procedure to reduce this uncertainty, focus on their day to day tasks afterwards, while the eliminate potential sources of error and, hereby, technical experts take over from then. (Even though
This implementation of the services is not yet designed. In also constitutes a deviation from the proposition the following, those microservices, which are also expressed in the original MBTDD-BD paper called subcomponents in the MBTDD-BD, are (Staegemann et al. 2020), since there, the assurance of grouped to components. A component constitutes a the functionality of the microservice as a whole was contentual unit that is deemed belonging together by described as only being implemented indirectly, the developers, respectively architect. Those could for through the tests within the developed service. example be the loading of data that consists of several Explicit tests were not intended. However, since the services that are each specialized to provide data from inclusion of such tests for the entire service allows to one specific (type of) source or the preprocessing that incorporate a view on the slightly bigger picture, comprises multiple steps that are each realized as a which is not necessarily given on the method level, separate microservice. However, there are no fixed their integration reduces the risk of overlooking issues rules, instead the definition of components is subject that are not as apparent when only operating on the to the individual assessment of the decision makers. method level. +Moreover, depending on the context, components can The creation of the tests for the microservice as a also overlap (e.g. a microservice can belong to several whole is followed by the test driven implementation components), or just comprise a single subcomponent, of that service, as it is described in the related in case it is rather standalone. Yet, for the sake of background section. Therefore, at first, the tests for a coherence, each microservice has to belong to at least function are written, then the functionality is one component. 
implemented and finally the code is refactored to +Subsequently, to later on assure that not only the increase its quality and readability. This procedure is components itself but also the communication repeated until the entire service is completed. While between them works as intended, corresponding tests the described process as a whole takes place on the have to be created. While all those steps, that happen subcomponent level, the implementation of the on the system level, are only conducted once, the particular functions corresponds to the method level. succeeding activities are performed repeatedly until Once the implementation is finished, the the implementation of all components is finished. At aforementioned tests for the entirety of the first, is has to be chosen, which component shall be subcomponent are run. In case that they do not pass worked on next. The criteria for this decision can be completely, the service goes back to the previous individually determined. Possible reasoning could, for implementation stage, where it is worked on until the example, be based on factors such as the availability issue is deemed resolved. Once the subcomponent of certain experts, the perceived importance or tests pass, the subcomponent level is left, the process complexity, or contentual relations and again enters the component level and the microservice interdependencies. It is also possible that a specific can be integrated into the current iteration of the microservice shall be implemented at this stage (for component. +example based on above mentioned criteria) and However, this is not the final step concerning the therefore the corresponding component is chosen at regarded service. It is possible that a microservice in this stage. After the decision is made, the system level itself is not erroneous and, therefore, the testing is is left and the work on the component level begins. 
positive, but there are issues with the interplay with If the component has not yet been worked on other services. An example (even though it is not big before, the next step is to create the tests for the data related) that made the news was the NASA component, otherwise this can be skipped, since it has climate orbiter crash from 1999, where one involved already been done in the past. Then it has to be partner used English units and the other metric ones, determined which microservice will be implemented leading to a failed mission, despite both parts in itself + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +113 +A Process Model for Test Driven Development in the Big Data Domain +being functional (NASA 2019). To avoid a similar situation, the integration of the subcomponent needs to be followed by a run of the component tests as well as the relevant tests for the communication. Only if those also pass, the microservice can be deemed finished. Otherwise, the developers have to go back to the development stage. However, in case of success, the component level is left and the system level is entered again. Now, the further procedure depends on the current status of the system’s implementation. If there are still components that are not entirely finished, it has to again be decided, which component should be worked on next. From there, the process continues as already outlined above. +In case every component, and therefore every part of the envisioned system, has been implemented and individually tested with success, a final test run that +comprises all tests (including those for the system as a whole) allows to check for a last time, if everything is working as intended. Should there be any problems, those have to be thoroughly analyzed. Once the source of error is identified, the developers shall fix the underlying issues, using the comprehensive test collection to assure that no new errors are introduced. 
However, if this last instance of quality assurance is also passed without the occurrence of any problems, the development process is finished and the system can be used productively. +The complete process model is displayed in Figure 2. To give an easy to follow overview of the proposed process model, its graphical depiction is heavily leaning onto the BPMN notation. However, this also introduces some constraints. The levels of the process are depicted as separate BPMN pools. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +A Process Model for Test Driven Development in the Big Data Domain + +Figure 2: Process Model for Test Driven Development in the Big Data Domain. +While this slightly deviates from the idea behind the differ from other development contexts, so that a concept of pools in BPMN, it increases visual clarity specific description is not necessary. +and was therefore implemented. Since the test driven Another aspect that is highly important but not implementation of the microservice is depicted as one directly covered by the process model is the selection step and not further broken down, there are only three of tools and technologies. While the modular nature levels shown, with the method level being omitted. of the MBTDD-BD allows for a high degree of +Furthermore, especially in larger projects, it is flexibility and gives the developers the choice, which likely that several teams work in parallel, whereas the programming languages, frameworks or existing depicted process presents a linear sequence. This is solutions they want to use, respectively incorporate, also for the sake of visual clarity. However, in reality, there is no support provided for those decisions. Since there might be several microservices (also from there is a plethora of available options, this task can, different components) be worked on at the same time. however, also be highly challenging. 
While there are Yet, this does not crucially affect the actual flow, already existing works that focus on a general wherefore it is only mentioned but not graphically decision support for the technology selection in BD represented. Additionally, the outlined process refers projects (Volk et al. 2020a), additional material that to projects that are created from scratch. If an is geared towards this specific situation might be application that was built according to the proposed helpful for prospective developers and, hence, also procedure shall be modified, the already existing tests help to facilitate the dissemination of TDD in the BD can be utilized. Changes on any other pre-existing domain in general. +systems are out of scope of the proposed process Additionally, as previously mentioned, the model and individual approaches have to be found. proposed model slightly simplifies the development process by presenting it as a sequential flow. While is +reality, several teams might work in parallel on 5 DISCUSSION AND FUTURE several services, the increased comprehensibility was deemed worth it to accept that slight simplification as +WORK a trade-off. When applying the model in a parallel scenario, it is therefore necessary to account for this +With the steady increase of the number of BD decision and adjust the actual workflow accordingly. applications that are being used and their quality Further, the model only outlines which actions assurance being one of the major challenges should be taken in which order, but not by whom. (Staegemann et al. 2019), finding ways to tackle that Even though the specifics of this decision obviously issue is highly important. 
While the MBTDD-BD heavily depend on the structures of the organizations approach seems generally promising to increase the and teams that are involved, the identification of best quality as well as the modifiability of the developed practices and recommendations could still prove to be systems, up to now, there was no structured procedure valuable support. Therefore, this might be a for its application. The proposed process model is worthwhile task for future researchers that has strong directed towards bridging this gap. By following the practical implications. +comprehensive sequence of steps, the necessary Since the quality of big data applications heavily activities can be covered, while also assuring that the depends on the correct architectural choices (Ataei order is actually sensible and corresponds to the spirit and Litchfield 2020) and there are numerous patterns of the TDD methodology. proposed for the implementation of microservices, it +However, several factors have to be taken into also appears reasonable to regard those two aspects in account. The first aspect is that the requirements for context of each other to determine, which the system are taken for granted. While this makes microservice patterns are best suited to deal with sense for the aspired scope, they are extremely certain challenges of big data development and the important for the success of an implementation underlying big data characteristics. project. Therefore, it is mandatory to find a suitable +approach for their collection. This also means that the +proposed process model cannot be seen as a panacea 6 CONCLUSION +but has to be used in conjunction with other suitable +methods. To a lesser degree this also applies to the +test driven implementation of the distinct Big data and the corresponding tools, technologies, and applications have emerged to be one of the microservices not being described in detail. However, driving factors of today’s society. 
Countless +on this level, the development does not crucially + +Figure 3: The DSR Grid for the Presented Work. +organizations from numerous domains rely on the endeavor in its entirety is given in Figure 3, in the form ability to utilize information to an unprecedented of the DSR Grid (Vom Brocke and Maedche 2019). extent to improve their inherent processes and +decision making, and, thereby, inter alia, reduce their +costs, increase their productivity, strengthen their REFERENCES +marketing, support their maintenance, improve their +logistics, or identify new opportunities. However, the +implementation of those systems is a highly Abd“aTlloawh,a rMds., a HDaamtam Caodll, ecAti.,o na nQdu aAlilt-yZ Myaoddaetl, fWor .B (i2g0 D2a2t)a. challenging and error-prone task, while at the same Applications,” in Business Information Systems +time their quality is crucial for the successful use. Workshops, W. Abramowicz, S. Auer and M. Stróżyna Therefore, their quality assurance is very important. (eds.), Cham: Springer International Publishing, pp. Yet, this domain is still far from being mature. 103-108 (doi: 10.1007/978-3-031-04216-4_11). Therefore, further work in this field is needed. This Al-Mekhlal, M., and Khwaja, A. A. (2019). “A Synthesis includes the improvement of existing approaches and of Big Data Definition and Characteristics,” in strategies as well as the exploration of new ones. One Proceedings of the 2019 IEEE International rather recent proposition was the application of test Conference on Computational Science and Engineering driven development to the implementation of big data (ECmSbEe)d deadn da ndI EUEbEi quIintoteursn aCtioomnpaul tinCgo n(EfeUreCn)c,e Neown +systems. However, it was not outlined how the York, NY, USA. 01.08.2019 - 03.08.2019, IEEE, pp. corresponding process should be designed. 314-322 (doi: 10.1109/CSE/EUC.2019.00067). +The publication at hand bridges this gap and Ataei, P., and Litchfield, A. (2020). 
“Big Data Reference provides developers that are interested in the Architectures, a systematic literature review,” in application of TDD in the BD domain with a process Australasian Conference on Information Systems model that outlines, which activities should be (ACIS) 2020, Wellington, New Zealand, AIS. performed in which order and, therefore, helps in Bahri, S., Zoghlami, N., Abed, M., and Tavares, J. M. R. S. structuring the implementation process. Thereby, it (A2c0c1e9s)s. “BIG( D7)A, TA foprp H. ealthcare: A Survey,” I(EdEoEi: helps in disseminating the general approach, 10.1109/ACCESS.2018.28891807).3 97-7408 +facilitates its effective utilization, promotes a stronger Beck, K. (2015). Test-Driven Development: By Example, focus on the topic of quality assurance, and can be Boston: Addison-Wesley. +used as a foundation to advance the scientific Bronson, K., and Knezevic, I. (2016). “Big Data in food and discourse in the domain. An overview of the research agriculture,” Big Data & Society (3:1) (doi: +10.1177/2053951716648174). +Chang, W. L., and Grady, N. (2019). “NIST Big Data Hevner, A. R., March, S. T., Park, J., and Ram, S. (2004). +Interoperability Framework: Volume 1, Definitions,” “Design science in information systems research,” MIS Special Publication (NIST SP), Gaithersburg, MD: quarterly, pp. 75-105. +National Institute of Standards and Technology. ISO. (2018). “International Standard ISO / IEC / IEEE Crispin, L. (2006). “Driving Software Quality: How Test- 29148 Systems and Software Engineering — Life +Driven Development Impacts Software Quality,” IEEE Cycle process - Requirements Engineering,” +Software (23:6), pp. 70-71 (doi: 10.1109/MS.2006.157). ISO/IEC/IEEE 29148:2018. +Davies, K., Keet, C. M., and Lawrynowicz, A. (2019). Janzen, D., and Saiedian, H. (2005). 
“Test-driven +“More Effective Ontology Authoring with Test-Driven development concepts, taxonomy, and future direction,” +Development and the TDDonto2 Tool,” International Computer (38:9), pp. 43-50 (doi: +Journal on Artificial Intelligence Tools (28:7) (doi: 10.1109/MC.2005.314). +10.1142/S0218213019500234). Karlesky, M., Williams, G., Bereza, W., and Fletcher, M. Diebold, F. X. (2012). “On the Origin(s) and Development (2007). “Mocking the Embedded World: Test-Driven +of the Term 'Big Data',” SSRN Electronic Journal (doi: Development, Continuous Integration, and Design +10.2139/ssrn.2152421). Patterns,” in Embedded Systems Conference, San Jose, Dobre, C., and Xhafa, F. (2014). “Intelligent services for California, USA. 01.04.2007 - 05.04.2007, UBM +Big Data science,” Future Generation Computer Electronics. +Systems (37), pp. 267-281 (doi: Keet, C. M., and Ławrynowicz, A. (2016). “Test-Driven +10.1016/j.future.2013.07.014). Development of Ontologies,” in The Semantic Web. Faitelson, D., Heinrich, R., and Tyszberowicz, S. (2018). Latest Advances and New Domains, H. Sack, E. +“Functional Decomposition for Software Architecture Blomqvist, M. d'Aquin, C. Ghidini, S. P. Ponzetto and +Evolution,” in Model-Driven Engineering and Software C. Lange (eds.), Cham: Springer International +Development, L. F. Pires, S. Hammoudi and B. Selic Publishing, pp. 642-657 (doi: 10.1007/978-3-319- +(eds.), Cham: Springer International Publishing, pp. 34129-3_39). +377-400 (doi: 10.1007/978-3-319-94764-8_16). Krylovskiy, A., Jahn, M., and Patti, E. (2015). “Designing Freymann, A., Maier, F., Schaefer, K., and Böhnel, T. a Smart City Internet of Things Platform with +(2020). “Tackling the Six Fundamental Challenges of Microservice Architecture,” in Proceedings of the 2015 +Big Data in Research Projects by Utilizing a Scalable 3rd International Conference on Future Internet of +and Modular Architecture,” in Proceedings of the 5th Things and Cloud (FiCloud 2015), I. 
Awan (ed.), Rome, +International Conference on Internet of Things, Big Italy. 24.08.2015 - 26.08.2015, Piscataway, NJ: IEEE, +Data and Security, Prague, Czech Republic. 07.05.2020 pp. 25-30 (doi: 10.1109/FiCloud.2015.55). +- 09.05.2020, SCITEPRESS - Science and Technology Kum, W., and Law, A. (2006). “Learning Effective Test Publications, pp. 249-256 (doi: Driven Development - Software Development Projects 10.5220/0009388602490256). in an Energy Company,” in Proceedings of the First +Fucci, D., Erdogmus, H., Turhan, B., Oivo, M., and Juristo, International Conference on Software and Data +N. (2017). “A Dissection of the Test-Driven Technologies, Setúbal, Portugal. 11.09.2006 - Development Process: Does It Really Matter to Test- 14.09.2006, SciTePress - Science and and Technology First or to Test-Last?” IEEE Transactions on Software Publications, pp. 159-164 (doi: Engineering (43:7), pp. 597-614 (doi: 10.5220/0001316101590164). 10.1109/tse.2016.2616877). Levin, I., and Mamlok, D. (2021). “Culture and Society in Goes, F. R., Meerhoff, L. A., Bueno, M. J. O., Rodrigues, the Digital Age,” Information (12:2), p. 68 (doi: +D. M., Moura, F. A., Brink, M. S., Elferink-Gemser, M. 10.3390/info12020068). +T., Knobbe, A. J., Cunha, S. A., Torres, R. S., and Müller, O., Fay, M., and Vom Brocke, J. (2018). “The Lemmink, K. A. P. M. (2020). “Unlocking the potential Effect of Big Data and Analytics on Firm Performance: of big data to support tactical performance analysis in An Econometric Analysis Considering Industry professional soccer: A systematic review,” European Characteristics,” Journal of Management Information journal of sport science, pp. 1-16 (doi: Systems (35:2), pp. 488-509 (doi: 10.1080/17461391.2020.1747552). 10.1080/07421222.2018.1451955). +Günther, W. A., Rezazade Mehrizi, M. H., Huysman, M., Nadareishvili, I., Mitra, R., McLarty, M., and Amundsen, +and Feldberg, F. (2017). “Debating big data: A M. (2016). 
Microservice architecture: Aligning literature review on realizing value from big data,” The principles, practices, and culture, Beijing, Boston, Journal of Strategic Information Systems (26:3), pp. Farnham, Sebastopol, Tokyo: O´Reilly. +191-209 (doi: 10.1016/j.jsis.2017.07.003). Nagorny, K., Lima-Monteiro, P., Barata, J., and Colombo, Häusler, R., Staegemann, D., Volk, M., Bosse, S., Bekel, C., A. W. (2017). “Big Data Analysis in Smart +and Turowski, K. (2020). “Generating Content- Manufacturing: A Review,” International Journal of +Compliant Training Data in Big Data Education,” in Communications, Network and System Sciences (10:03), +Proceedings of the 12th CSEdu, Prague, Czech pp. 31-58 (doi: 10.4236/ijcns.2017.103003). +Republic. 02.05.2020 - 04.05.2020, SCITEPRESS - NASA. (2019). “Mars Climate Orbiter,” available at Science and Technology Publications, pp. 104-110 https://solarsystem.nasa.gov/missions/mars-climate- (doi: 10.5220/0009513801040110). orbiter/in-depth/, accessed on Feb 27 2022. +Peffers, K., Tuunanen, T., Rothenberger, M. A., and Staegemann, D., Volk, M., Saxena, A., Pohl, M., Nahhas, +Chatterjee, S. (2007). “A Design Science Research A., Häusler, R., Abdallah, M., Bosse, S., Jamous, N., +Methodology for Information Systems Research,” and Turowski, K. (2021b). “Challenges in Data +Journal of Management Information Systems (24:3), pp. Acquisition and Management in Big Data +45-77 (doi: 10.2753/MIS0742-1222240302). Environments,” in Proceedings of the 6th International Sangwan, R. S., and Laplante, P. A. (2006). “Test-Driven Conference on Internet of Things, Big Data and +Development in Large Projects,” IT Professional (8:5), Security, Prague,Czech/Online Streaming. 23.04.2021 - +pp. 25-29 (doi: 10.1109/MITP.2006.122). 25.04.2021, SCITEPRESS - Science and Technology Shahin, M., Ali Babar, M., and Zhu, L. (2017). “Continuous Publications, pp. 193-204 (doi: +Integration, Delivery and Deployment: A Systematic 10.5220/0010429001930204). 
+Review on Approaches, Tools, Challenges and Staegemann, D., Volk, M., and Turowski, K. (2021c). +Practices,” IEEE Access (5), pp. 3909-3943 (doi: “Quality Assurance in Big Data Engineering - A +10.1109/ACCESS.2017.2685629). Metareview,” Complex Systems Informatics and Shakir, A., Staegemann, D., Volk, M., Jamous, N., and Modeling Quarterly (28), pp. 1-14 (doi: +Turowski, K. (2021). “Towards a Concept for Building 10.7250/csimq.2021-28.01). +a Big Data Architecture with Microservices,” in Staegemann, D., Volk, M., and Turowski, K. (2022b). +Proceedings of the 24th International Conference on “Adapting the (Big) Data Science Engineering Process +Business Information Systems, Hannover, to the Application of Test Driven Development,” in +Germany/virtual. 14.06.2021 - 17.06.2021, pp. 83-94 Proceedings of the 19th International Conference on +(doi: 10.52825/bis.v1i.67). Smart Business Technologies, Lisbon, Portugal. Shull, F., Melnik, G., Turhan, B., Layman, L., Diep, M., 14.07.2022 - 16.07.2022, SCITEPRESS - Science and +and Erdogmus, H. (2010). “What Do We Know about Technology Publications, pp. 120-129 (doi: +Test-Driven Development?” IEEE Software (27:6), pp. 10.5220/0011289200003280). +16-19 (doi: 10.1109/MS.2010.152). Statista. (2021). “Volume of data/information created, Silva, E. S., Hassani, H., and Madsen, D. Ø. (2019). “Big captured, copied, and consumed worldwide from 2010 +Data in fashion: transforming the retail sector,” Journal to 2025,” available at +of Business Strategy (41:4), pp. 21-27 (doi: https://www.statista.com/statistics/ 871513/worldwide- +10.1108/JBS-04-2019-0062). data-created/, accessed on Feb 13 2022. +Slaats, T., Debois, S., and Hildebrandt, T. (2018). “Open to Volk, M., Staegemann, D., Bosse, S., Nahhas, A., and +Change: A Theory for Iterative Test-Driven Modelling,” Turowski, K. (2020a). “Towards a Decision Support in Business Process Management, M. Weske, M. System for Big Data Projects,” in WI2020 Zentrale Montali, I. Weber and J. 
Vom Brocke (eds.), Cham: Tracks, N. Gronau, M. Heine, K. Poustcchi and H. Springer International Publishing, pp. 31-47 (doi: Krasnova (eds.), GITO Verlag, pp. 357-368 (doi: 10.1007/978-3-319-98648-7_3). 10.30844/wi_2020_c11-volk). +Sommerville, I. (2007). Software Engineering, eighth Volk, M., Staegemann, D., Pohl, M., and Turowski, K. +edition, Addison-Wesley. (2019). “Challenging Big Data Engineering: Staegemann, D., Feuersenger, H., Volk, M., Liedtke, P., Positioning of Current and Future Development,” in +Arndt, H.-K., and Turowski, K. (2022a). “Investigating Proceedings of the 4th International Conference on +the Incorporation of Big Data in Management Internet of Things, Big Data and Security, Heraklion, +Information Systems,” in Business Information Systems Crete, Greece. 02.05.2019 - 04.05.2019, SCITEPRESS +Workshops, W. Abramowicz, S. Auer and M. Stróżyna - Science and Technology Publications, pp. 351-358 +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +119 diff --git a/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt b/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt new file mode 100644 index 0000000..f0d175c --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/41-SYNTHETIC FLIGHT TEST DATA FOR BIG DATA COMPUTING.txt @@ -0,0 +1,127 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +SYNTHETIC FLIGHT TEST DATA FOR BIG DATA +COMPUTING +Bob Baggerman +Avionics Test and Analysis Corp (ATAC) 4540 East Highway 20 +Niceville, FL 32578 +bob.baggerman@avtest.com +ABSTRACT +There is currently quite a bit of development taking place within the DoD flight test range community in “Big Data” computing. 
A problem plaguing development is a lack of suitable data sets for development and test of software analysis tools. Most actual flight test data has restricted distribution and so isn't available for many developers. Also, it can be difficult to find actual recorded flight test data which have “interesting” properties such as specific flight profiles and events. +Synthesized IRIG 106 Chapter 10 format flight test data solves these problems by providing data files to developers that are very similar to what might be expected from an actual flight test. Synthetic data files are complete and properly formed data files that contain fake but realistic flight test data as if it had been recorded during an actual flight test. The data in these data files is designed to provide interesting test cases for software tool developers to use. +INTRODUCTION +The Department of Defense (DoD) has been pursuing cloud based storage and processing solutions for flight test data. Storing and processing flight test data in the cloud is a fundamentally different kind of processing environment that will require new software tools and techniques to be developed. Development of these new analysis software tools and techniques requires test data that isn’t readily available to developers. Software tools for creating carefully crafted synthesized (i.e. synthetic) data files have been developed to create useful synthetic flight test data sets. +Big Data is typically defined by the three “V”s, volume, velocity, and variability. The volume of data refers to data sets that are too large to be processed and viewed all at once on a single computer. The velocity of data refers to the speed at which data is coming in and must be processed. The variability of data refers to the wide assortment of data sources and formats to consider. Current modern flight test programs certainly strain under volume and velocity constraints. 
For most DoD flight test programs the bulk of the recorded data is in IRIG 106 Chapter 10 format. +Up until recently flight test data analysis has primarily involved the analysis of single or a small number of recorded flight test data files. There are numerous applications that will read, interpret, and display recorded data from a single flight test. Cloud based computing will allow new, more sophisticated types of analysis to be done. For the first time “big data” kinds of analysis can be performed on a large number of data sets. +Whereas up until now flight test data analysis addressed question of how a system under test performed in the most recent flight test, cloud-based big data analytics (BDA) analytics allow more sophisticated analysis across multiple data set. Below are several examples of types of analytics that could be accomplished in a cloud based BDA environment. +As we consider synthetic data it is important to keep in mind that the System Under Test (SUT) is the Big Data Analytics platform. These synthetic data sets are to support BDA development and software test. +EXAMPLES OF BIG DATA ANALYSIS +Nominal Flight Path Calculation +Consider an instrument approach flown to 32 at China Lake Naval Air Weapons Station (NAWS) airport. This approach is depicted in Figure 1 below. When flying this approach it is important to pass the final approach fix KATIE at or above 4400’. Interesting analysis questions might be “what is the average altitude error and standard deviation over the final approach fix (FAF)” or “what flights were more than 3 Standard Deviations from the correct Altitude at the FAF?” +Synthetic data with the necessary variability can be easily generated to support development of this kind of analysis. + + +Figure 1 - Example flight path for approach +Flight Segments for Analysis +Next consider the need to identify flight paths for various test runs as shown in Figure 2 below. 
To measure the performance of an aircraft system under test (for example a targeting system) it is necessary to identify segments of flight test data that demonstrate performance.
Development teams lack people and facilities with the appropriate access to controlled data. +2) Existing real world data sets lack “interesting” features for developers to test search and analyze algorithms. Most actual flight test data does not present good test cases for software development, test, and validation. +Synthetic flight test data solves these problems by providing data that has unrestricted distribution and is well crafted to provide useful test cases. +TYPES OF SYNTHETIC DATA +In the analysis examples discussed above it is necessary to have very specific data sets to test and validate new analysis software. Because of this synthetic data is synthesized several different ways depending on the purpose of the underlying test. +Contrived Data – This data is unrealistic flight test data but instead presents data types and values useful for testing correct decoding and conversion of IRIG 106 values. For example, a flight data file with ARINC 429 data has recently been created with integer and floating point values. Messages with minimum values, maximum values, specific positive values, specific negative values, and zero values were created to verify correct decoding. +Synthesized Data – This data attempts to mimic realistic flight test data but with very controlled flight conditions. For example, a flight data file with aircraft navigation MIL-STD-1553 data messages derived from an aircraft simulation software program has been created. This flight data file is completely software created but realistically mimics the position, attitude, and speed of an actual test aircraft flying a typical mission on a test range with specific altitude, speed, and heading parameters. +Repurposed Data – This data recasts previously recorded flight data into IRIG 106 format. NASA had a program to record flight data on regional commercial jets. There are data files for about 220,000 over several years. 
Each flight data file records over 150 different flight parameters useful for inclusion in derived IRIG 106 format data files for big data analytics.
In this example ARINC 429 data messages were defined in various formats including signed and unsigned integer with minimum, maximum, and zero values. +Once an appropriate XML content definition data file has been authored, the XML is converted into a Chapter 10 format data file using the FLIDAS software application from Data Bus Tools GmbH. + +Synthesized Data +In the case of synthesized data the contents of the resultant Chapter 10 data file are derived from pre-calculated aircraft state data. The goal of the pre-calculated aircraft state data is to provide aircraft state that is both realistic, deterministic, and carefully controlled. The Government Off the Shelf (GOTS) BlueMax6 simulation software available from DSIAC is used to pre-calculate realistic simulated flight data based on a provide detailed input scenario file. +BlueMax6 calculates realistic aircraft dynamic state based on an input scenario file. This scenario file describes the desired flight path at a high level of abstraction. The aircraft type and some initial information such as initial position, heading and speed are first specified. Then the flight path is defined as a series of various types of waypoints and maneuvers, eventually ending in a landing maneuver. A portion of an example scenario file is shown in Figure 5. The flight path shown in Figure 2 was generated from a BlueMax6 scenario. 
+BlueMaxRunTitle A-10 China Lake Echo Range Aircraft A-10A +CallSign FOLK1 +EntityID 0:0:0:0 +ZuluTime 00:00:00.00 +DtedTerrain On +InitialPitch 0 +InitialPositionLL 35.6959:N 117.6915:W InitialAltitudeMSLf 2110 +InitialTrueHeading 154.5 +InitialAirspeedKtas 50 +InitialThroPosition Auto +InitialGearPosition Down +OutputFileName A-10__China_Lake__Echo_Range__ OutputRateSec 0.04 +ManeuverLimits Autopilot AutopilotMaxRoll 45 AutopilotMinPitch -10 AutopilotMaxPitch +25 +CmdAltitudeMSLf 2300 CmdGearPosition 2200 CmdAirspeedMach BestRateOfClimb CmdFlapPosition Auto CmdSegmentEndMode Acquisition CmdFlySegment +WriteMessage Low Pass Takeoff CmdTrueHeading 154.5 CmdGroundRangeNm 2 CmdAltitudeMSLf 2300 CmdThroPosition 300 CmdFlapPosition 0 CmdSlatPosition 0 CmdFlySegment +WriteMessage China Lake Skytop CmdWaypointLL 35.700833:N 117.499167:W CmdWaypointNavMode Direct CmdAltitudeMSLf 6000 +CmdAirspeedKtas 300 +CmdFlySegment +Figure 5 – Example BlueMax6 scenario file. +BlueMax6 generates an output file with calculated values of aircraft state at regular time intervals. For most synthesized data runs a time step of 40 msec (50 Hz) is chosen. BlueMax6 currently has 497 different aircraft state values available for output. Besides aircraft attitude, position, velocities, and accelerations other values such as throttle position, landing position, and others are also output and used in the synthesized flight data file. +To convert BlueMax6 output files to Chapter 10 data files several conversion software programs have been developed. Each software program written is a command line console application written in C++. The current software is targeted for the Windows environment but is sufficiently generic that it could be easily ported to other operating systems such as Linux. The source code for these software programs are readily available from github. +There are two approaches to generating Chaptert 10 files from BlueMax6 data. 
In the direct conversion approach BlueMax6 data is read and directly converted into a Chapter 10 data file. This data file includes synthesized data in MIL-STD-1553, Pulse Code Modulation (PCM), and ARINC-429 data types. +When video is to be included in the Chapter 10 file a second conversion approach is used. When video is to be generated BlueMax6 data is first read and stored in a SQLite database. A playback application is used to read navigation data from the database, send aircraft position and attitude data to the X-Plane flight simulator application, and for each navigation point perform a screen capture. Each screen capture is then processed by the ffmpeg digital video encoder library and converted into an MPEG Transport Stream (TS) series of video packets. These TS video packets are then stored back in the SQLite database. This process is repeated for each channel of video desired. This process is depicted in Figure 6. +Video generation is currently a very slow process. With current desktop hardware and a software- only encoder it runs at about one-half real time. For this reason video isn’t necessarily generated for synthesized data sets. From a test and software validation standpoint video data is usually of limited utility. +Once BlueMax6 data has been stored in the SQLite database along with optional video it is processed and converted into a Chapter 10 data file. This process is depicted in Figure 7. The conversion software is a simple fixed time slice simulation engine. Data is read periodically from the SQLite database and stored in a state variable matrix, various simulation modules such as those used to generate navigation data use and add to the state variable matrix, and data formatter modules are used to synthesize and write the output Chapter 10 data. 
+ +Figure 6 – Preprocessing and synthetic video generation + +Figure 7 – Synthetic Chapter 10 data file generation +Repurposed Data +In the early 2000’s NASA had a program to record and make generally available flight data from a number of commercial regional jets. Flight data was recorded onboard a single type of regional jet operating in commercial service over a three-year period. NASA makes this data available on their DASHlink website. +The recorded data includes 186 flight parameters. Detailed aircraft dynamics, system performance, and other engineering parameters are included. Data files for over 220,000 flights were recorded and are available. Figure 8 shows a set recorded flight paths. Figure 9 show a set of recorded flight paths in the vicinity of Detroit’s Wayne County airport. +Although the NASA recorded data sets aren’t carefully controlled, the large number of recorded flights flying on regular routes makes this data set useful for testing big data types of analysis. + + + + + + + + + + +Figure 8 – Example of NASA recorded flights across the country + + + + + + + + + + + + + + + + +Figure 9 – Example of NASA recorded flights near Detroit +NASA makes these data files available in Matlab format. A python script was written to convert these Matlab format files into Comma Separated Value (CSV) format files for later processing. After conversion to CSV format, conversion to Chapter 10 format is accomplished in the same manner as conversion from BlueMax6 data previously shown in Figure 6 and Figure 7. +CONCLUSIONS +The DoD move to cloud computing is enabling development of Big Data Analytics capabilities. Development of new software tools and techniques will require large quantities of data and especially data with interesting features. Synthesized flight test data may be the only practical way to provide the quantities and types of data necessary for software development. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
diff --git a/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt b/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt new file mode 100644 index 0000000..526a3ad --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/45-Big-Data-based-Testing-Characteristics-Challenges.txt @@ -0,0 +1,176 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2021 7th International Symposium on System and Software Reliability (ISSSR) +Big Data-based Testing: Characteristics, Challenges, and Future Directions + Pan Liu Yihao Li +Faculty of Business Information School of Information and Electrical Engineering Shanghai Business School, Shanghai, China Ludong University, Yantai, China +panl008@163.com yihao.li@ldu.edu.cn +Lian Zeng Xuankui Zheng Sihao Huang +Shanghai Business School Shanghai Business School Shanghai Business School +18786201272@163.com 1079737114@qq.com 1160114530@qq.com + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract—With the rise of the applications of the Internet of Things (IoT) in human society, how to ensure the reliability of IoT systems has become a research hotspot. Generally, there are complex interactions between multiple systems in IoT. Therefore, even if a single system can pass rigorous tests, it may not be able to guarantee that the system runs reliably in a complex IoT environment. With the operation of the IoT system, a large amount of data will be generated to record sensor data, system operations, user’s operations, and other information. Therefore, software faults or software design defects can be discovered if we use appropriate big data technology to mine the massive amount of data. 
The paper states the characteristics of big data-based testing and compares this test method with traditional software test methods in the software life cycle. Then, the paper discusses the challenges of applying big data-based testing to IoT systems. Finally, some future research directions of big data-based testing are given in the paper. +Keywords: big data-based testing; big data technology; system reliability; IoT systems +I. INTRODUCTION +With the advent of the IoT era, more and more large- scale systems related to the national economy and people's livelihood, such as power operation system, rail transit system, and aerospace system, have been connected to the network, and software has become a key to the normal operation of IoT. However, frequent software failures have caused the problem of "trustworthy crisis" [1-3] in software. For example, due to a line of code error, the blockchain project YAM worth 500 million dollars https://news.bitcoin.com/new-defi-yield-farming-project-yam- finance-sees-460-million-locked-in-17-hours/ +2 https://www.space.com/china-far-side-moon-rover-strange- substance.html +978-1-6654-3431-7/21/$31.00 ©2021 IEEE 44 +DOI 10.1109/ISSSR53171.2021.00012 + was closed on August 12, 2020. Because of insufficient testing, the SpaceX rocket of the US Space Exploration Technology Company exploded when it was returned on the ground on February 2, 2021 [4]. Therefore, once the IoT system runs incorrectly or is maliciously manipulated, the consequences will be unimaginable. +In the past, software testing is an effective way to detect software faults and improve software quality [5]. However, IoT systems often run in an extremely complex environment. Thus, it is an impossible task to test them completely. For example, due to the harsh space environment on the moon, +China’s Yutu lunar2 rover was paralyzed on the lunar surface after less than two months of operations. 
This indicates that the previous software and hardware test for Yutu lunar rover was insufficient. In addition, one IoT system often has complex interactions with other IoT systems. If we stop a running IoT system and test it, it is likely to affect the normal operation of other IoT systems, resulting in huge economic losses. However, the traditional software testing methods, such as unit testing, integration testing, system testing, and acceptance testing, are difficult to effectively solve the above two problems because it is impossible to exhaustively test IoT systems. Therefore, industry and academia urgently need to study new methods of software testing to improve the quality of IoT systems. +Recently, some scholars proposed a novel software testing method based on big data technology [6-8]. This testing method lies on the emphasis of the analysis of software running logs [9,10] or user operation data recorded by the software to detect software faults or software design defects. As the running time of the software increases, the system logs or the data recorded by the system will contain a large number of system operation information. If we regard these massive operations on the system as the software testing process, the system has already completed the massive testing, and software faults and software design defects must be recorded in the data. Therefore, these faults and defects can be detected from the data if big data mining techniques are effective. This test method is also suitable for detecting software faults and design defects of IoT systems. First of all, the IoT system will generate a large amount of data, such as sensor data, system logs, and system forum data. By mining these data, we can detect software faults and software design defects. For example, we have realized the performance test of the networking efficiency of apps and found a small number of network failure events of WeChat by analyzing its networking data [11]. 
Secondly, the operation of the IoT system can be optimized according to the result of data analysis. For example, Al-Ali et. al [12] improved the smart home management system through the big data analysis of the smart home, and improved the user’s experience of the smart home. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +The paper discusses big data-based testing, and compares this test method with traditional software testing methods in the software life cycle. Then, we also discuss the challenges of applying big data-based testing to ensure the reliability of IOT systems. Finally, some future research directions for big data-based testing are given to ensure the reliability of IoT systems. +The contributions of the paper include: +(1) We discussed the evolution of the software life cycle and the relationship between traditional software testing methods and big data-based testing. Then, we constructed four models to describe the evolution process of the software life cycle. +(2) We summarized the three challenges of big data- based testing to ensure the reliability of IoT systems. +(3) We presented five future research directions for big data-based testing. +II. BIG DATA-BASED TESTING +A. Software Life Cycle +software release phase, software maintenance and update phase, and software obsolescence phase, as shown in Fig. 1 (a). From Fig. 1 (a), software development is accompanied by software testing in the past. If we consider iteration of software multiple versions, software life cycle can be represented by the model in Fig. 1 (b). If we consider the interaction between users and software, software life cycle can be described by the model in Fig. 1 (c). After using the software, users will put forward some suggestions for the improvement of the software according to their own habits. Programmers can update the software according to these user requirements, and then the next software version will be released. 
However, there are two difficulties in achieving the above process. First, not all users of software can express clearly what software requirements need to be improved. Second, users of the software may not be able to observe all software faults and software design defects. Therefore, we need to study the new and non-manually method to generate the software update requirement report. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +45 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +Generally, software life cycle [13,14] can be arbitrarily divided into software development and testing phase, +(a) software development maintenance and software +software Release +and testing upgrade obsolescence +iteration evolution +(b) software development maintenance and software +version Release +and testing upgrade obsolescence +iteration evolution +(c) software development software upgrade software +version Release customer use +and testing requirement obsolescence +iteration evolution +(d) software development software upgrade software +version Release customer use +and testing requirement obsolescence +big data fault and defect +data collection +analysis mining +Figure 1. Four models for describing the evolution of the software life cycle + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +Because an amount of data is generated from the IoT system, we can collect them and use big data technology to deal with them. Thus, it is possible to dig out software faults and software design defects from the data. We can construct a new model shown in Fig .1 (d) to describe the software life cycle. From Fig. 
1 (d), data collection, big data analysis, and data mining are used to detect software faults and software design defects so as to generate the software update report. The test method is called big data-based testing. Its core idea is to use big data technology to mine software faults and software design defects that are not found by traditional software testing methods in the software life cycle. +Note: in practice, big data-based testing cannot replace those traditional software testing methods. Even if software faults and software design defects are detected, software testers still need to use some traditional software testing methods to fix them. +B. Characteristics +Comparing to traditional software testing methods, big data-based testing has the following characteristics: +(1) Big data-based testing is implemented after the software is released. +(2) Big data-based testing does not require testers to design and execute test cases, but to detect software faults and design defects by collecting and analyzing data. Therefore, the cost of software testing is saved. +(3) Big data-based testing is a data-driven testing method, that is, this testing method depends on the availability of the data generated by the software and the effectiveness of the data acquisition, filtering and analysis methods. +(4) After software faults are detected by big data-based testing, the traditional software testing methods also need to be used to fix software faults and software design defects. +(5) Big data-based testing can not only find software faults, but also detect software design defects, which is difficult to achieve by traditional software testing methods. +C. Comparison +The relationship between traditional software testing methods and big data-based testing is shown in Fig. 2. From Fig. 2, traditional software testing methods and big data- based testing are both part of the software life cycle. 
Traditional software testing methods are completed before the software is officially released, while big data-based testing is completed after the software is released. Therefore, both traditional software testing methods and big data testing realize the whole process testing of the software life cycle. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +46 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +traditional software testing methods big data-based testing +software +testers test cases test execution life cycle data collection data analysis +bug fix fault and defect mining +before software release after software release +Figure 2. The relationship between traditional software testing methods and big data-based testing + +Item Traditional software testing methods Big data-based testing bug fix yes no software design defect no yes Table 1 shows the difference between traditional software testing methods and big data-based software testing. From Table 1, traditional software testing methods are to find software bugs by executing test cases. Therefore, these test methods usually require testers to design test cases and execute test cases. Compared with traditional software testing methods, big data-based software testing requires data analysts to collect data, analyze data, and mine software faults and defects in software design. In addition, both traditional software testing methods and big data-based testing can detect software faults. Traditional software testing methods can fix software bugs, but cannot find defects in software design. Big data-based testing can detect defects in software design, but it is difficult to locate and fix software faults. +III. CHALLENGES +By collecting and analyzing the relevant data generated by the IoT systems, software faults and software design +defects can be discovered. 
Then, we can model software behaviors to simulate the usage scenario of software that +triggers software faults or displays software design defects. Next, exception execution paths of software are generated +from the model using model-based testing. Finally, we can instantiate test cases of these paths to reappear software bugs +TABLE I. +COMPARISON OF TRAD- ITIONAL SOFTWARE TESTING and design defects in the IoT system. To realize the above METHODS AND BIG DATA BASED TESTING process, there are still some challenges in big data-based +Item Traditional software testing methods Big data-based testing method execution of test cases data collection, analysis and data mining staff testers data analyst phase in the soft. life cycle before software release after software release software fault detection yes yes testing. +Challenge 1: How to analyze the data generated by the IoT systems so that valid data can be retained to realize the mining of software bugs and design defects? +The IoT systems generate massive amount of data every day and most of the data are invalid and redundant [15], which leads to the surge of data storage cost and the difficulty of data analysis [6]. Thus, we need to construct a data filtering model to filter invalid and redundant data. Before adopting the big data analysis technologies, we cannot predict whether there are software bugs or design defects in the IoT system. So, it is an unwise choice to + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +47 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +analyze all the data directly. To solve this problem, researchers put forward the data sampling analysis method [11,12]. The main idea of the proposed method is to first select part of data from the whole data to conduct data analysis. 
If software faults or software design defects can be found, it indicates that the data filtering model and data analysis method are effective. Then, according to the 2-8 law, we can use the data filtering model and the data analysis method to mine all data. Otherwise, we need to redesign the data filtering model and apply a new data analysis method to deal with the data. Sampling analysis method can be applied to analyze mass data, but the difficulty of applying the method lies in choosing of the right sampling strategy and constructing of the effective data filtering model. In the future, the data sampling strategies and new data filtering models will be two research directions to realize the detection of both software faults and software design defects with the low cost of data analysis. +Challenge 2: What kind of model can be constructed to simulate the behavioral characteristics of users using the software in a complex scenario? +Once software faults or software design defects are found, we need to reproduce these faults and defects so that programmers can repair them. However, IoT systems are often used in a very complex application scenario, and there may also be complex interactions between users and systems. Therefore, it is a key for reproducing software faults and software design defects to construct a model to accurately describe the interaction between users and IoT systems. Generally, software behaviors include not only traditional operations such as concatenation, selection, and loop, but also operations such as synchronization, concurrency and alternation between multiple operations [3,16]. Thus, to model complex software behaviors, we need to consider the testability of the selected model so that it is easy to generate test paths from the model and instantiate test cases from test paths [17]. In the past, finite state machine (FSM [18-21]) was usually used to model software behaviors. 
However, because FSM does not support synchronization and concurrency operations [16], it cannot simulate all software behaviors in IoT systems. To enhance the modeling ability of FSM, extended finite state machine (EFSM [22,23]) and extended regular expression (ERE [16,24,25]) models have been proposed to model software behaviors. These models not only have more powerful modeling capabilities than FSM, but also generate test paths from the models easily. The difficulty in using EFSM and ERE models lies in the lack of modeling tools that can be used in industry. Although a few tools, such as MTTool [2], CREST [23], and SDL [26], were developed to support modeling and test generation for EFSM or ERE, these tools still have shortcomings in the multi-level modeling of large-scale complex systems. +Challenge 3: How to quickly locate software bugs and design defects in program statements so as to assist programmers in fixing them? +Model-based testing [21,27-29] can produce the expected execution path and expected result of the software running. Then, we can detect software faults by observing inconsistencies between the model and the actual software. +However, this test method does not involve a single line of code. As a result, it is hard to locate software faults in the program. Combining model-based testing methods and program slicing technology [30,31] may be a way to realize the location of software faults and design defects in the future. +IV. FUTURE DIRECTION +Due to the difficulty of simulating the operating environment of the IoT systems exhaustively, it is hard for IoT systems to realize sufficient testing. Through the collection and analysis of data generated from the IoT system, software faults and design defects in the IoT system can be discovered. To realize this purpose, there is still some research that needs to be carried out in the future.
+a) Intent-based data collection method +The data generated from IoT systems [32]includes: 1) the Web log on the server that records the user's various operations on the software, 2) software error information that is submitted by the user after the software crashes, 3) various operating data of the user to the software, and 4) forum data of the IoT system. Recording all the data will increase the cost of data storage, and a large amount of invalid data will also lead to the failure of big data analysis. In the past, people usually cleaned and formatted those collected big data, and then analyzed them. Therefore, the intention-based data collection method needs to be used to reduce the collected data. To realize the intention-based data collection method, we need to study the classifications of test intent. For example, to find software design defects, we should eliminate those data including standardized operations that follow the software design requirements using a data filtering model because these operations to software have been tested in traditional software testing methods. The defects in software design often come from users’ non-standard operations. Thus, the data including non-standard operations need to be collected in this test intent. In the future, different data collection methods for different test intents, including software design defects, software performance, and software application areas, will need to be studied. +b) Analysis methods for unstructured data +Generally, the data that records users’ use of the software are mostly unstructured data, such as log data. To analyze unstructured data, we need to perform field extraction, syntactic analysis, and semantic analysis on the collected data. 
Therefore, for analysis and research on unstructured data, in the future, there are the three research directions, including massive data incremental sampling analysis method, the extended regular expression modeling method of unstructured data, and the software fault mining method using extended regular expression model. +Before using big data analysis methods to dig out software faults and software design defects, we can neither predict that the software contains faults or defects, nor predict which data mining methods that will surely detect software faults and software design defects. Aimless data analysis will lead to the increase of the data analysis cost. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +49 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +Thus, it is necessary to screen out the data that can be used to find software faults. An effective data analysis method can discover software faults with the low cost. Currently, the incremental sampling analysis method is an effective data collection strategy with the low cost. In the future, it will be necessary to study the selection strategies of the data, the conditions for terminating data selection, the analytical methods of data characteristics, and the construction method of the data filtering model. +In the past, to extract information from unstructured data, we used the regular expression to model data features. Then, effective information can be filtered and extracted from the massive data according to this model. Although this method is very effective for the data with obvious features, it is hard for regular expressions to describe those data with complex relationship among data features. Therefore, extended regular expression needs to be studied to solve this problem in the future. 
+c) Modeling tool based on regular expression +After constructing the extended regular expression model for filtering the massive data, we also need to solve a key problem that is a supported tool for modeling extended regular expression. Currently, most of the existing data analysis tools support the processing and analysis of regular expression, but do not support the processing and analysis of extended regular expression. In the future, the modeling theory of extended regular expression and the conversion rules from the model to test paths need to be studied. The difficulty of this research is how to ensure the validity of the transformation from the extended regular expression model to a group of sub regular expression models. +d) Software behavior modeling +In the past, to simulate software behaviors, researchers usually need to build models such as FSM, label transition system, and Petri net [32]. However, the relationship between software behaviors in the Internet of things is very complex, such as concurrency and synchronization, which leads to the modeling failure of FSM and label transition system. To model software behaviors in the IoT, it is necessary to clarify the interaction between users and software, such as whether the concurrent operation is between users, how the server responds to these operations, whether the user operation meets the business process and so on. +e) Software fault location combining model-based testing and program slicing technique +Through data mining, software faults or software design defects can be found. Then, we can get execution paths using model-based testing for reproducing software faults and design defects in IoT system. To help programmers fixing software faults and design defects, we also need to locate software faults in the program. In the past, programmers usually used program slicing technique to locate software faults. 
Therefore, how to combine model-based testing and program slicing technique to find software faults is one of the future research directions. +V. CONCLUSION +Generally, the IoT system runs in a very complex environment, so it is difficult to realize the complete test of the IoT system in traditional software methods. As a result, it is hard to ensure the reliability of the IoT system by using the way of software testing. To improve the reliability of the IoT system, we recommend big data-based testing. Because the IoT system will produce a large amount of data, including system operation data, user interaction data, sensor data, etc., we can detect potential software faults or software design defects by mining these data. Currently, there are a number of online data sources3,4,5 available to realize software defect detection. This paper discusses the characteristics of big data-based testing, and compares this method with traditional software testing methods. Then, this paper presents the current challenges of big data-based testing, and gives the future research directions of this method. The work in this paper has a very important reference for the promotion and application of big data-based testing. +REFERENCES +[1] V. V. G. Neto, "A model-based approach towards the building of trustworthy software-intensive systems-of-systems," in 2017 IEEE/ACM 39th International Conference on Software Engineering Companion (ICSE-C), 2017, pp. 425-428. +[2] P. Liu and Z. Xu, "MTTool: A Tool for Software Modeling and Test Generation," IEEE Access, vol. 6, pp. 56222-56237, 2018. +[3] X. Cheng, Y. Wang, W. Zhou, X. Wang, and J. Wang, “Software fault detection for sequencing constraint defects,” International Journal of Performability Engineering, vol. 16, no. 11, pp. 1814–1825, November 2020. +[4] L. Dawson, "Technological Risks of Space Flights and Human Casualties," in The Politics and Perils of Space Exploration, ed: Springer, 2021, pp. 225-241. +[5] S. Masuda, K. Ono, T. 
Yasue, and N. Hosokawa, "A survey of software quality for machine learning applications," in 2018 IEEE International conference on software testing, verification and validation workshops (ICSTW), 2018, pp. 279-284. +[6] A. Miranskyy, A. Hamou-Lhadj, E. Cialini, and A. Larsson, "Operational-log analysis for big data systems: Challenges and solutions," IEEE Software, vol. 33, pp. 52-59, 2016. +[7] J.-G. Lou, Q. Fu, S. Yang, Y. Xu, and J. Li, "Mining Invariants from Console Logs for System Problem Detection," in USENIX Annual Technical Conference, 2010, pp. 1-14. +[8] X. Zhang, Y. Xu, Q. Lin, B. Qiao, H. Zhang, Y. Dang, C. Xie, X. Yang, Q. Cheng, and Z. Li, "Robust log-based anomaly detection on unstable log data," in Proceedings of the 2019 27th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering, 2019, pp. 807-817. +[9] R. Abbas, Z. Sultan, and S. N. Bhatti, "Comparative analysis of automated load testing tools: Apache jmeter, microsoft visual studio (tfs), loadrunner, siege," in 2017 International Conference on Communication Technologies (ComTech), 2017, pp. 39-44. +[10] Y.-J. Chen and H.-Y. Chien, "IoT-based green house system with splunk data analysis," in 2017 IEEE 8th International Conference on Awareness Science and Technology (iCAST), 2017, pp. 260-263. +[11] P. Liu, "Big Data Testing Technology: data collection, analysis, and test practice," Posts and Telecom Press, 2018. (in Chinese) +3 https://academic.oup.com/nar/article/46/D1/D14/4316108 4 https://sir.csc.ncsu.edu/portal/index.php +5 https://www.kaggle.com/ + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +50 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. + +[12] X. Wu, X. Zhu, G.-Q. Wu, and W. Ding, "Data mining with big data," IEEE transactions on knowledge and data engineering, vol. 26, pp. 
97-107, 2014. +[13] V. T. Rajlich and K. H. Bennett, "A staged model for the software life cycle," Computer, vol. 33, pp. 66-71, 2000. +[14] T. R. D. Saputri and S.-W. Lee, "Integrated framework for incorporating sustainability design in software engineering life-cycle: An empirical study," Information and Software Technology, vol. 129, +p. 106407, 2021. +[15] M. Gudipati, S. Rao, N. D. Mohan, and N. K. Gajja, "Big data: Testing approach to overcome quality challenges," Big Data: Challenges and Opportunities, vol. 11, pp. 65-72, 2013. +[16] P. Liu and H. Miao, "Theory of Test Modeling Based on Regular Expressions," in Structured Object-Oriented Formal Language and Method, ed: Springer, 2014, pp. 17-31. +[17] P. Liu, H.-K. Miao, H.-W. Zeng, and Y. Liu, "FSM-based testing: Theory, method and evaluation," Jisuanji Xuebao(Chinese Journal of Computers), vol. 34, pp. 965-984, 2011. +[18] A. A. Andrews, J. Offutt, and R. T. Alexander, "Testing Web applications by modeling with FSMs," Software & Systems Modeling, vol. 4, pp. 326-345, 2005. +[19] W. Li, F. L. Gall, and N. Spaseski, "A Survey on Model-Based Testing Tools for Test Case Generation," in International Conference on Tools and Methods for Program Analysis, 2017, pp. 77-89. +[20] C. Gaston and D. Seifert, "Model-Based Testing of Reactive Systems. Advanced Lectures, chapter Evaluating coverage based testing," ed: Springer-Verlag, Berlin, 2005. +[21] P. Liu, Y. Li, and Z. Li, "Some Thoughts on Model-Based Test Optimization," in 2019 IEEE 19th International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2019, pp. 268-274. +[22] Y. Chen, A. Wang, J. Wang, L. Liu, Y. Song, and Q. Ha, "Automatic Test Transition Paths Generation Approach from EFSM Using State Tree," in 2018 IEEE International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2018, pp. 87-93. +[23] K. Androutsopoulos, N. Gold, M. Harman, Z. Li, and L. 
Tratt, "A theoretical and empirical study of EFSM dependence," in 2009 IEEE +International Conference on Software Maintenance, 2009, pp. 287- 296. +[24] P. Liu, J. Ai, and Z. J. Xu, "A study for extended regular expression- based testing," in Computer and Information Science (ICIS), 2017 IEEE/ACIS 16th International Conference on, 2017, pp. 821-826. +[25] O. Kilinccceker, E. Turk, M. Challenger, and F. Belli, "Regular Expression Based Test Sequence Generation for HDL Program Validation," in 2018 IEEE International Conference on Software Quality, Reliability and Security Companion (QRS-C), 2018, pp. 585- 592. +[26] W. E. Wong, T. Sugeta, J. J. Li, and J. C. Maldonado, "Coverage testing software architectural design in SDL," Computer Networks, vol. 42, pp. 359-374, 2003. +[27] F. Abbors, T. Ahmad, D. Truscan, and I. Porres, "MBPeT: a model- based performance testing tool," in 2012 Fourth International Conference on Advances in System Testing and Validation Lifecycle, 2012. +[28] A. Aerts, M. R. Mousavi, and M. Reniers, "A Tool Prototype for Model-Based Testing of Cyber-Physical Systems," vol. 9399, pp. 563-572, 2015. +[29] M. Markthaler, S. Kriebel, K. S. Salman, T. Greifenberg, S. Hillemacher, B. Rumpe, C. Schulze, A. Wortmann, P. Orth, and J. Richenhagen, "Improving model-based testing in automotive software engineering," in 2018 IEEE/ACM 40th International Conference on Software Engineering: Software Engineering in Practice Track (ICSE-SEIP), 2018, pp. 172-180. +[30] N. AlAbwaini, A. Aldaaje, T. Jaber, M. Abdallah, and A. Tamimi, "Using Program Slicing to Detect the Dead Code," in 2018 8th International Conference on Computer Science and Information Technology (CSIT), 2018, pp. 230-233. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +51 +Authorized licensed use limited to: UNIVERSIDADE ESTADUAL DO CEARA. 
Downloaded on February 16,2024 at 13:15:03 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt b/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt new file mode 100644 index 0000000..59ee7cf --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/46-SIM-PIPE DryRunner An approach for testing.txt @@ -0,0 +1,131 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +© 2022 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works. +SIM-PIPE DryRunner: An approach for testing container-based big data pipelines and generating simulation data + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Aleena Thomas SINTEF AS +Oslo, Norway +Aleena.Thomas@sintef.no +Dumitru Roman SINTEF AS Oslo, Norway +Dumitru.Roman@sintef.no +Nikolay Nikolov SINTEF AS +Oslo, Norway +Nikolay.Nikolov@sintef.no +Brian Elves ter SINTEF AS +Oslo, Norway +Brian.Elves ter@sintef.no +Antoine Pultier SINTEF AS +Oslo, Norway +Antoine.Pultier@sintef.no +Ahmet Soylu +Oslo Metropolitan University Oslo, Norway +Ahmet.Soylu@oslomet.no + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Abstract—Big data pipelines are becoming increasingly vital in a wide range of data intensive application domains such as digital healthcare, telecommunication, and manufacturing for efficiently processing data. 
Data pipelines in such domains are complex and dynamic and involve a number of data processing steps that are deployed on heterogeneous computing resources under the realm of the Edge-Cloud paradigm. The processes of testing and simulating big data pipelines on heterogeneous resources need to be able to accurately represent this complexity. However, since big data processing is heavily resource-intensive, it makes testing and simulation based on historical execution data impractical. In this paper, we introduce the SIM-PIPE DryRunner approach – a dry run approach that deploys a big data pipeline step by step in an isolated environment and executes it with sample data; this approach could be used for testing big data pipelines and realising practical simulations using existing simulators. +Index Terms—Big data pipelines; Dry run; Software containers; Sandbox; Testing; Simulation +I. INTRODUCTION +The need for supporting big data pipeline processing is increasing rapidly with more and more applications running on the Cloud and large IoT systems handling huge volumes of data [1]. Big data pipelines are designed to handle large amounts of streaming and batch processing data and are becoming indispensable in a wide variety of application domains +[2]. One of the main challenges in managing big data pipelines is analyzing the behaviour of different pipeline steps in order to deploy them in a cost-effective manner. Since deploying computing resources for these pipelines is expensive, it is crucial to adjust the deployment parameters for optimized execution and to ensure only required resources are provisioned +[3]. Therefore, one of the key aspects of the big data pipeline lifecycle relates to testing and simulation before deployment in a production setting [4].
Testing refers to executing steps in a pipeline according to its definition, whereas simulation focuses on estimating the performance of the pipeline in the actual +computing infrastructure by predicting the performance of the pipeline given the execution parameters. An efficient means of testing and simulating pipelines before deployment allows identifying errors and bottlenecks early and addressing them before provisioning expensive computing resources in the actual production environment on the Cloud-Edge continuum. There are multiple simulation solutions for big data pipelines (e.g., [5]–[7]). One of the main challenges with the simulators is that most of the existing approaches rely on results from previous runs of pipelines or analyses by an expert in order to make predictions [4]. In the case of big data, predicting performance using previous runs is likely to result in high costs if the pipeline is highly computing-intensive. Big data pipelines are complex and dynamic processes built to run on top of a multitude of heterogeneous services and computing resources, which makes prediction of their performance a challenge [2]. To this end, we propose an approach—SIM-PIPE DryRunner—based on dry running of big data pipelines. We describe dry running of big data pipelines as the execution of a pipeline using a sample or smaller input data size (compared to the full-scale big data) on a test environment as opposed to using the infrastructure for production deployment. The overall approach is depicted in Figure 1. We assume that the resource usage metrics for the dry run of the pipeline on a representative set of small input data can be used in the analysis of its behaviour for large amounts of input data. The proposed approach deploys each step in the correct order in an isolated testing environment, hereafter called a sandbox.
We use an isolated environment (e.g., a virtual machine) for the dry run, since it can reduce interference from other running applications and ensures better estimates of the performance for the pipelines. The approach enables one to run the pipeline and analyze it in a lower cost environment than simulators, which do additional processing to simulate the actual computing environment like the Cloud or Edge + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +This is the author accepted version of an article published in +2022 IEEE 46th Annual Computers, Software, and Applications Conference (COMPSAC) https://doi.org/10.1109/COMPSAC54236.2022.00182 + +Fig. 1. Dry run approach for testing and simulating big data pipelines. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +where it will be deployed in production. The approach, firstly, could be used to check the correctness of the pipeline and to ensure that the pipeline is working as expected and producing the expected output. Secondly, dry run results can be used in simulators to aid in predicting the performance of the pipeline and identify possible bottlenecks. Thereby, the dry run result of the pipeline for a small data size may be used to predict the performance for bigger data sizes, assuming that the data are processed in chunks/slices. For example, metrics collected by dry running with different chunk sizes can be used to estimate infrastructure resources required for scaling the pipeline (e.g, CPU, memory and disk size, and using multiple processes). Software container technologies could simplify the execution of data pipelines [8] both in isolated and production envi- ronments by encapsulating individual data pipeline steps in platform and programming language independent containers. In this paper, we describe the proposed dry run approach and present a tool—the SIM-PIPE DryRunner tool—implementing the approach. 
The overall SIM-PIPE solution aims at using the dry run results for testing the pipelines and simulating them using existing simulators. +The rest of the paper is organized as follows. Section II provides the description of our approach as well as the technical architecture and implementation. In Section III, we present a use case for the proposed approach, while Section IV presents related work. In Section V, we summarize our approach and provide directions for future work. +II. SIM-PIPE DRYRUNNER APPROACH +The proposed approach based on dry running of big data pipelines relies on the use of an isolated sandbox environment to execute pipeline steps. By maintaining an isolated testing environment, we are able to get an estimate of the resource usage of each step without interference from other running processes. Moreover, the container-based implementation of the step facilitates accurate estimation of its total execution time in the actual deployment infrastructure. This is due to the homogeneity of container technologies, which ensures that the execution of the container is reproducible regardless of the computing infrastructure in which it is executed. Thus, by running the container-based implementations of the pipeline steps, we ensure that we obtain values from dry run, which +can be used to predict how the pipeline behaves on resources on the Cloud-Edge continuum. +Figure 2 shows the main steps of the dry run process. Once a dry run is initiated, a step in the pipeline and sample data are deployed to the sandbox using a container. During the execution of the step, execution time will be recorded and the sandbox will be continuously pooled for metrics about the execution. These metrics are stored for later use. 
Once the step has successfully performed the data processing task, the resulting data will be retrieved, the running step will be removed from the sandbox, and the same process will be repeated for the next steps (i.e., deploy the step and feed it with the resulting data from the previous one). Based on the data gathered, analytics will be performed to derive results that apply to the entire pipeline. The pipeline steps, in case of steps performing batch processing, are provided with a sample input to be used during the dry run. In case of steps which perform continuous processing, there is a user-defined option to provide the number of seconds to wait before the step is terminated; this ensures that the correctness of the step and recording of resource usage metrics can be done for that specified amount of time. All the details including resource usage statistics, inputs to the steps, and outputs of the execution are stored and eventually used to perform resource usage analytics. +In the following we describe the technical architecture and implementation of the SIM-PIPE DryRunner tool, and outline +a typical use of the tool. +A. Technical Architecture and Implementation +In order to demonstrate the feasibility of the approach for dry running of big data pipelines, we designed and implemented a prototype application—the SIM-PIPE DryRunner tool. It consists of several components that are deployed separately in order to ensure an appropriate execution environment for the dry run approach. The current version of the tool, along with installation instructions are available on GitHub1. +Figure 3 shows the deployment topology and architecture for the SIM-PIPE DryRunner tool. The tool is designed to be deployed in two separate hosts: one for hosting the front-end and business logic, and one for hosting the sandbox environment.
The main component is the dry run controller, which performs a step-wise analysis of the pipeline by deploying steps and +1https://github.com/DataCloud-project/SIM-PIPE + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + + +Fig. 2. The SIM-PIPE DryRunner process for testing and collecting performance data. + +Fig. 3. SIM-PIPE DryRunner tool: deployment topology and architecture. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +collecting relevant data. Host 1 in Figure 2 contains the dry run controller and REST service (which serves the front-end of the implementation) as well as the dry run data storage, which is implemented using TimescaleDB2. In our implementation, these sub-components are deployed on the host using Docker containers. The necessary files for providing the input and storing the output of each step are transmitted and stored using an SFTP server which also runs in a Docker container in host 2. When deploying a step to be analyzed, the dry run controller sends (if needed) data over SFTP to the sandbox host, which makes it available to the container and executes the step. +The dry run controller and REST service are implemented using NodeJS3 and use a number of NodeJS libraries related to +2https://www.timescale.com 3https://nodejs.org +managing the execution of containers on a target host, namely dockerode4 for container execution control in the sandbox and ssh2-sftp-client5 for interacting with the SFTP server on the sandbox. The REST API is developed using GraphQL6 (a query language for APIs). Hasura7 is used to develop and +connect to the data model of the dry run data storage. The front-end of the SIM-PIPE DryRunner tool is implemented using Appsmith8. +The current version of the SIM-PIPE DryRunner tool user interface is depicted in Figure 4. 
The interface displays a list of +4https://github.com/apocas/dockerode 5https://github.com/theophilusx/ssh2-sftp-client 6https://graphql.org +7https://hasura.io +8https://www.appsmith.com +dry runs tied with a specific pipeline as well as the associated runs to each dry run. For each run, it displays the run state (“Waiting”, “Queued”, “Active”, “Completed”, “Failed”, or “Cancelled”) as well as statistics on each of the steps. The statistics include the used CPU, memory, network, and running time. In addition to the statistics, the current version of the user interface displays logs from the execution of the steps. The tool assumes that the pipeline description is provided in the form of a Domain Specific Language (DSL) which is described in a Github repository9. This DSL has been developed as part of the DEF-PIPE tool which is a GUI (Graphical user Interface) based tool to design, implement and store big data pipelines. More details and usage guidelines of this tool are given in a Github repository10. +The current implementation supports explicitly step imple- mentations as described in the big data pipeline approach in [9], whereby each container collects input data, stores output data, and any intermediate data separately in a file system. Thereby, the SIM-PIPE DryRunner tool provides input data to the steps and stores intermediate step outputs for analysing the dry run. Other step implementations that do not use file-based data transmission are also applicable, but the data delivery system currently does not support this. +The dry run data storage uses a relational database model and records each dry run with a timestamp and pipeline identifier. Each run is also associated with the DSL model that was used when the run was started as well as its (current) status and the timestamps when the run was created, started, and ended. 
Each run stores data for each of the steps that are in the input DSL model with the step name, status, and metrics about the used CPU and memory. Intermediate data are stored on disk in a file system that are marked with the pipeline identifier, run identifier, and step number and can be served on request to the front-end. +B. Using the SIM-PIPE DryRunner tool +Dry run using the SIM-PIPE DryRunner tool is done through the following steps: +• First, the user creates a new dry run for a pipeline by providing its DSL description and sample input data using the SIM-PIPE DryRunner tool UI. +• The user starts a new dry run and the current status of the run and each step is displayed in the UI. +• After each step has completed execution indicated by its status, the user can click on the step to view the logs generated during execution, CPU usage percentage, network usage, memory usage and maximum memory usage over time. +• In case of failure of a step, the status of the step and correspondingly run would indicate failure status, and only the logs would be displayed which may help in debugging. +9https://github.com/DataCloud-project/DEF-PIPE-DSL 10https://github.com/DataCloud-project/DEF-PIPE +• The step can also be stopped while running, and this stops the current step and all the succeeding steps in the pipeline. +III. USE CASE +The SIM-PIPE DryRunner tool was tested on data pipelines in the context of a digital health system, where developers and data engineers are using data pipelines to implement different e-health services. The main objective of the digital health sys- tem is to monitor, support and help patients, especially elderly, at their homes, remotely. 
The system uses data pipelines to gather sensor data (e.g., welfare sensors and medical devices) from the patients, store and process the patient data, and provide relevant data to the right stakeholder at the right time (e.g., notifications of events to healthcare providers, storing data in electronic health records, and providing data and notifications to third party health systems). +Figure 5 illustrates a generic digital health data pipeline that involves three steps: 1) Data generation, pre-processing and routing, 2) Data storage and analysis, and 3) End user application logic. The first step is deployed on the Edge, while the two latter are deployed on the Cloud. The steps are the same three steps shown in the SIM-PIPE DryRunner tool UI in Figure 4. The first step involves collecting and formatting sensor data from healthcare sensors and medical devices that the patient uses. The second step involves storing the data and checking it against the patient plan. The third step involves different types of end user application logic, such as notifying healthcare providers and submitting reports to 3rd party healthcare systems. +Several instances and variants of data pipelines are deployed in the digital health use case. There are pipeline instances for each patient. Some of the challenges in managing the various variants of pipelines relates to i) scaling individual steps of the pipeline, ii) the need to build new applications for each new type of sensor, and iii) finding the optimal resource allocation for data processing steps. The SIM-PIPE DryRunner tool is used to address these challenges, allowing the developers and data engineers of the digital health data pipelines to test new variants of the pipelines without deployment on production infrastructure in order to identify trouble spots and bottlenecks early, as well as better understand the resource requirements required from the metrics collected by the SIM- PIPE DryRunner tool. +IV. 
RELATED WORK
+There are several simulation approaches for data pipelines that include tools to simulate big data pipelines, such as the event-based simulator GroudSim [5], and the process-based simulators GridSim [6] and CloudSim [7]. Despite the number of simulation approaches in the literature, there are few that can be used for testing and simulation of big data pipelines. Liu et al. [10] present a survey of scientific workflow management systems in the context of big data pipelines; out of the five
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+
+Fig. 4. SIM-PIPE DryRunner tool front-end.
+
+Fig. 5. SIM-PIPE DryRunner tool front-end.
+systems presented, only two of them (Taverna (https://incubator.apache.org/projects/taverna.html) and Swift (https://github.com/square/workflow-swift)) had a simulation or testing component. While Taverna is specialized to support bio-informatics pipelines, Swift only provides tools for unit and integration testing of pipelines. These simulators vary in the ways in which they accept data for simulating a pipeline. Many of them run pipelines multiple times and the results from the runs are used in simulation [11].
+Iatropoulou et al. [12] present a data pipeline management system for container-based big data pipelines that supports design, composition, configuration, orchestration, enactment, and validation of end-to-end big data analytic services. Each step in the input pipeline is provided in the form of one of the four predefined containerized application images (named as Apps) which is part of their microservices architecture. Though it handles several types of big data workflows, it is not open source and thus cannot be extended.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+
+V. CONCLUSIONS AND OUTLOOK
+We proposed a new approach—SIM-PIPE DryRunner—for dry running of big data pipelines using an isolated sandbox for deployment of steps. 
Testing and simulation of big data pipelines is challenging, since the existing methods depend on information from previous runs or domain expert knowledge, which are difficult to acquire in case of big data pipelines. We also developed an initial version of the tool—the SIM-PIPE DryRunner tool—with a user interface in which the pipeline designer can input and dry run big data pipelines and view the results of the resource usage of step execution and logs. The dry run results of the big data pipeline can be used in existing simulators by bringing them into the respective format that can be used as input. One limitation of this method is that it assumes that the big data pipelines have container-based implementations. +In the future, we aim to enable the SIM-PIPE DryRunner tool to recommend minimum requirements for the resources necessary to run the pipeline steps successfully (i.e., the minimum memory and CPU requirements) and to provide an estimation of the optimal horizontal scaling for each individual step that will allow for executing the pipeline without bottlenecks. Future work also involves extending it further by integrating advanced analytics for the results obtained from the sandbox. This involves predicting the resource usage performance and total execution time of the pipeline when a given input size is specified. We also aim to analyze and quantify the impact of parallelisms for various pipeline steps. This can be used in configuring the resources at deployment or in scheduling algorithms. Finally, we also plan to use the dry run results in existing simulators. This requires investigation of input formats which is accepted by these simulators and conversion of the output of our tool into a format that is usable by them. +Acknowledgements. 
This work received partial funding from the European Commission Horizon 2020 DataCloud project (grant number 101016835), the NFR BigDataMine project (grant number 309691), and the SINTEF internally funded SEP DataPipes project. +REFERENCES +[1] R. Buyya, S. N. Srirama, G. Casale, R. Calheiros, Y. Simmhan, +B. Varghese, E. Gelenbe, B. Javadi, L. M. Vaquero, M. A. S. Netto, +A. N. Toosi, M. A. Rodriguez, I. M. Llorente, S. D. C. D. Vimercati, +P. Samarati, D. Milojicic, C. Varela, R. Bahsoon, M. D. D. Assuncao, +O. Rana, W. Zhou, H. Jin, W. Gentzsch, A. Y. Zomaya, and H. Shen, “A manifesto for future generation cloud computing: Research directions for the next decade,” ACM Computing Surveys, vol. 51, no. 5, 2018. +[2] M. Barika, S. Garg, A. Y. Zomaya, L. Wang, A. V. Moorsel, and +R. Ranjan, “Orchestrating big data analysis workflows in the cloud: Research challenges, survey, and future directions,” ACM Computing Surveys, vol. 52, no. 5, 2019. +[3] A. Shakarami, H. Shakarami, M. Ghobaei-Arani, E. Nikougoftar, and +R. Faraji-Mehmandar, “Resource provisioning in edge/fog computing: A comprehensive and systematic review,” Journal of Systems Architecture, vol. 122, p. 102362, 2022. +[4] I. Bambrik, “A survey on cloud computing simulation and modeling,” SN Computer Science, vol. 1, no. 5, p. 249, 2020. +[5] S. Ostermann, K. Plankensteiner, R. Prodan, and T. Fahringer, “Groudsim: An event-based simulation framework for computational grids and clouds,” in Proceedings of the Euro-Par Parallel Processing Workshops (Euro-Par 2020), ser. LNCS, vol. 6586. Springer, 2010, pp. 305–313. +[6] R. Buyya and M. Murshed, “Gridsim: A toolkit for the modeling and simulation of distributed resource management and scheduling for grid computing,” Concurrency and computation: practice and experience , vol. 14, no. 13-15, pp. 1175–1220, 2002. +[7] R. N. Calheiros, R. Ranjan, A. Beloglazov, C. A. De Rose, and R. 
Buyya, “Cloudsim: a toolkit for modeling and simulation of cloud computing environments and evaluation of resource provisioning algorithms,” Soft- ware: Practice and experience, vol. 41, no. 1, pp. 23–50, 2011. +[8] M. Matskin, S. Tahmasebi, A. Layegh, A. H. Payberah, A. Thomas, +R. Nikolov, and D. Roman, “A survey of big data pipeline orchestration tools from the perspective of the datacloud project,” vol. 3036, 2021. +[9] N. Nikolov, Y. D. Dessalk, A. Q. Khan, A. Soylu, M. Matskin, A. H. Payberah, and D. Roman, “Conceptualization and scalable execution of big data workflows using domain-specific languages and software containers,” Internet of Things, vol. 16, p. 100440, 2021. +[10] J. Liu, S. Lu, and D. Che, “A survey of modern scientific workflow scheduling algorithms and systems in the era of big data,” in Proceedings of the IEEE International Conference on Services Computing (SCC 2020). IEEE, 2020, pp. 132–141. +[11] T.-P. Pham, J. J. Durillo, and T. Fahringer, “Predicting workflow task execution time in the cloud using a two-stage machine learning approach,” IEEE Transactions on Cloud Computing, vol. 8, no. 1, pp. 256–268, 2017. +[12] S. Iatropoulou, P. Petrou, S. Karagiorgou, and D. Alexandrou, “Towards platform-agnostic and autonomous orchestration of big data services,” in Proceedings of the IEEE Seventh International Conference on Big Data Computing Service and Applications (BigDataService 2021). IEEE, 2021, pp. 1–8. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt b/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt new file mode 100644 index 0000000..a47daf1 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/48-Poc testing analysis of big data products.txt @@ -0,0 +1,58 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt b/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt new file mode 100644 index 0000000..6945d9f --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/5 - Analysis_on_the_Quality_Model_of_Big_Data_Software.txt @@ -0,0 +1,141 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Analysis on the Quality Model of Big Data Software + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. + +Xijiao Xu +Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center +Shanghai, China xxj@sscenter.sh.cn + Jiayu Gong +Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center +Shanghai, China gjy@sscenter.sh.cn + Huanming He +Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center +Shanghai, China hhm@sscenter.sh.cn +Wei Song +Shanghai Key Laboratory of Computer Software Evaluation. Shanghai Computer Software Technology Development Center +Shanghai, China songw@sscenter.sh.cn + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. 
Restrictions apply. 
+
+Abstract—With the rapid development of the big data system, the big data system has the characteristics of large data scale, diverse data and high computational complexity. Its testing method has to be constantly improved. By analyzing the general software quality model, and combining the characteristics of the big data software, a quality model for the big data software is formed.
+Keywords—Big Data, Quality Requirements, Software Model
+I. INTRODUCTION
+The rapid development of the Internet has given birth to a large number of new frontier technologies. Big data is a hot emerging industry in recent years. The Internet has created a large-scale application environment for big data technology, which first originated from the Internet. The Internet provides the most important data foundation for big data. The analyzing and processing capabilities of big data also bring more developing possibilities for the Internet
+companies. In this article, the big data system is defined as a system that centrally stores big data resources; meets the high-concurrency, mass-data requirements for high-performance computing and large-capacity storage; and provides a large amount of openness in capabilities such as data collection, data calculation, data storage, data analysis, and data visualization.
+As a new application technology, the big data system carries the core business of the platform frequently, so the comprehensive testing and evaluating of the big data system is particularly important. 
However, due to the characteristics of the big data, its testing methods are different from the traditional software test. The evaluated model of the general software quality ,which is used in the big data system, cannot reflect the characteristics of the big data system such as large data scale, diverse data, high computational complexity, and +distributed structure. This paper will establish a set of software quality model for the big data system to provide reference for the test and evaluation of the big data system, from the perspective of software quality evaluation model and combining with the big data system evaluated examples. +II. THE EVALUATED MODEL OF THE SOFTWARE PRODUCT QUALITY MODEL +Software products have different quality requirements from the perspective of different users. Users consider that the software is easy to use, easy to learn, flexible and user-friendly as the high-quality software. Product managers consider that the software is easy to maintaining, easy to modifying, and easy to developing because of thinking about the product marketing competitiveness. Developers usually consider the software’s complexity and importance as the important indicators of the software quality. So it has great significance to establishing the software quality standard, which is beneficial to improving the product’s software quality. +At present, the general software quality standards widely used and recognized in the industry are ISO/IEC 25023:2016[1~2]. The software products’ quality evaluated model includes ISO/IEC 25051 software quality model[3]. In this model, the software quality characteristics are defined as functional suitability, performance efficiency, compatibility, usability, reliability, security, maintain-ability and portability. These quality characteristics can be used as the general software quality metrics, but the quality of the big data system cannot be measured. 
+The difference between the big data systems and the traditional systems is storage, mainly about the database storage and the file storage. The searching engine companies were the first to feeling the technical challenges of the massive amounts of data. Subsequently, the rise of the social media sites and the mobile Internet aggravated this challenge. The Internet companies find that the growth, the diversity, and the processing timeliness requirements of the new data cannot be dealt with by the traditional databases and business intelligent vertical scaling architectures. Because the traditional database is designed to capturing data, if you directly get data from it for analysis, there will be many problems, such as complex + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. + + This work was supported by National Key R&D Program of China (No. 2018YFB1403404). +978-1-6654-1893--5/21/$31.00 ©2021 IEEE 78 +ICIS 2021-summer, June 23-25, 2021, Shanghai, China + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. + +structure, messy data, missing history, slow query when the amount of data is large, etc. At this time, you need a "data warehouse ". As a result, the distributed file system—— Google File System (GFS) was first proposed, the distributed computing system and the distributed database solved the predicament faced by the big data with the lower cost and laid the foundation for the flourishing of big data technologies such as HBase, Cassandra, MongoDB, Neo4j and Redis and other databases. 
The computing processing engine gradually covers scenarios such as offline batch computing, real-time computing, stream computing, and the computing frameworks of MapReduce, Spark, Flink, and Storm are born. In the field of data query and analysis, it has formed a wealth of SQL on Hadoop solutions, massively parallel processing (MPP) architecture, Hive, HDFS, MR, TeraData, GreenPlum and other technologies. The universal system frame diagram of applying big data technology is shown in Figure 1, which contains the common components of the big data system. + +Fig. 1. The system frame diagram for Big Data System +Therefore, according to the characteristics of the big data system, it is necessary to provide more quality measures for its software quality model, and comply with the following principles[4]: +1) Performance efficiency should consider the processing speed, the response time, the resource consumption, throughput, etc. The general performance testing tools are not suitable for the big data system’s measurement, and there are many types of modules in the big data system, also the different modules require the different testing techniques, so multiple testing tools are frequently needed. +2) The testing environment and monitoring plan of the big data system should be considered. The testing environment of the big data system is complex, and +the factors that affect the performance of the big data system are numerous and complicated, including network environment, application, virtualization, data quality, etc., so it is necessary to monitor the entire Cluster machines, services, computing, storage, tasks and other information. +3) The measurability of the quality characteristics should be considered. It should be measured by subjective and objective means, and the cost of measurement should be taken into account. It should be easy to measure and convenient for data collection. 
The data processed by the big data system has the characteristics of large-scale (Volume), various types (Variety), and fast production speed (Velocity). In the test process of the big data system, the more realistic the test data set is, the more reliable the test results will be. +III. THE EVALUATED MODEL OF THE BIG DATA SOFTWARE QUALITY +Based on the above evaluation principles, and combined with the ISO/IEC 25051 software quality model, a three-tier structure framework is formulated for the test quality evaluated model of the big data system, as shown in Figure 2. In this framework model, the quality factor layer is the eight quality characteristics of the software quality model; the quality sub- elements are the refinement of its upper quality factor layer; the bottom layer is the software quality metric (including various parameters), which is a quantitative software characteristic indicators. For example, the resource consumption mentioned in the article is the software quality metric of resource availability which is attributed to performance efficiency. + + + +Metric Metric Metric Metric Metric Metric +Fig. 2. Quality Evaluated Model +A. Functional Suitability +The functional sub-characteristics of the big data system mainly include data collection, data storage, data analysis, etc. For the big data system, it mainly measures its data analysis and processing function modules, namely data tables or data files. The specific measurement elements include[5-7]: + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. + +79 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. 
+
+(1) Verify the completed data table, and that the table name is consistent with the agreement;
+(2) Verify that the data table fields are complete, and that field name, field type, length precision and other attributes are consistent with the convention;
+(3) Verify that the primary key of the data table is set consistent with the agreement, and that the technical constraints hold: there are no records with duplicate primary keys and no records with null primary key fields;
+(4) Verify that the time constraint is consistent with the convention.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+B. Performance Efficiency
+The sub-characteristics of performance efficiency mainly verify the platform components of the big data system, including HDFS, HBASE, SPARK, Cloudera and so on. Under each sub-characteristic, the performance testing elements of the big data system mainly include: throughput, data processing, query response time, etc. The components and metrics are shown in Table 1.
+Table 1 Components and Metrics
+Components Metrics
+HDFS Throughput (Read and Write Performance)
+HBASE Data processing (Read and Write Requests per second)
+SPARK Data processing
+Cloudera The Monitoring Component of Hadoop Platform
+Throughput: Platform IO processing capability is suitable for HDFS, Hbase and other technologies. The involved tools of performance analysis include the TestDFSIO tool that comes with Hadoop and the performance testing tool Yahoo! Cloud Serving Benchmark (YCSB), etc.; the database IO processing capabilities, such as MPP database, can include sequential table scan single node performance, single node import and export, and accurate query of tens of billions of tables.
+Data processing: including the speed of executing queries or MapReduce jobs, as well as the computing power of the platform. For example: the Spark computing power mainly uses aggregate query and the Terasort algorithm as performance evaluated standards. Aggregate query is the task of submitting an aggregate query in the Spark cluster, and you can view the amount of data processed by each Executor and the processing time by visiting Spark's Web UI interface; Terasort algorithm evaluation is also in the Spark cluster. By running the TeraSort tool, the generated random data is sorted, and the amount of data processed by each Executor and the processing time can be viewed by accessing Spark's Web UI interface. The Spark's Web UI interface is shown in Figure 3.
+
+Fig. 3. The Spark's Web UI Interface
+
+C. Compatibility
+Compatibility mainly includes co-existence, interoperability and other aspects. Among them, interoperability is to evaluate the ability of information transfer and interaction between two or more modules. In the big data system framework, data providers introduce new data or information into the big data system; data consumers use applications provided by the big data application providers. There are rich interfaces among the data providers, the data consumers and the big data application providers, such as the data access interface, the data acquisition interface, the data verification interface, etc.[8]. It requires these interactive interfaces to follow the rules of big data collection and retention, data access in multiple formats (structured, semi-structured, unstructured), and support for common data collected tools.
+D. Usability
+Usability mainly includes learnability, user error protection and so on. The measurement of learnability includes consideration of whether the software presentation documents or the software system helping documents are easy to operate, comfortable and effective. And according to the file, whether the big data system can be easily deployed, or a graphical interface system of the configured tool is provided. User error protection considers whether the system prompts the delete operation when the product software performs the delete operation.
+E. Reliability
+Reliability mainly includes availability, fault tolerance, easy recovery and so on. For the big data system, under the above sub-features, the main measured elements are system redundancy and data backup strategy.
+System redundancy: Check whether the number of sub-nodes of the HDFS, HBase, and MPP components of the big data system is redundant.
+Data backup strategy: Check the number of copies of HDFS data's settings, and the HBase and MPP databases' data backup strategy.
+F. Security
+The sub-characteristics of information security mainly include confidentiality, non-repudiation, authenticity, data security etc.
+80
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply.
+
+• Confidentiality: User access rights of the big data system include the configuration of roles and users in the unit of system components, according to the granularity of data table level and data column level to assign permissions to users;
+• Non-repudiation: the operation log of the big data system cannot be modified or deleted;
+• Authenticity: identity authentication mechanism; check the identity authentication method, password complexity requirements and login of users by the big data system.
+• Data Security: check whether the system provides data storage encrypted and decrypted functions; sensitive data is encrypted in transport.
+G. Maintain-ability
+Maintainability mainly includes analyzability and modifiability. 
The analyzability’s elements are to confirm the installation and deployment of the big data cluster nodes and the data nodes, and to view the version information of the system. Modifiability is mainly to check the system's online upgrade function and data update mode. +H. Portability +The sub-characteristics of portability includes adaptability and installability. The adaptability’s metric is to confirm the operating system, database, browser that the big data system is adapted to. Installability is mainly check whether the managing node and data node of the big data cluster can be installed. +suitable for big data system , compared with the general software quality model for analysis. It is hoped to provide reference for the big data platform test and improve the quality of the big data software. +REFERENCES +[1] ISO/IEC 25010:2011 “System and software engineering—Systems and software quality requirements and evaluation(SQuaRE) Part 10: System and software quality models”; +[2] ISO/IEC 25023:2016“ Systems and software engineering—Systems and software Qualitu Requirements and Evaluation(SQuaRE)- Measurement of system and software product quality” ; +[3] ISO/IEC 25051:2014 “System and software engineering——Systems and software quality requirements and evaluation(SQuaRE) Part 51:Requirements for quality of ready to use software product (RUSP) and instructions for testing”; +[4] Yuyu Yuan. Practical quality model for evaluating software products. 
Computer Engineering, 29(5):32-34, 2003; +[5] GB/T 38673—2020 “Informantion technology ——Big data——basic requirements for big data systems(Chinese)” ; +[6] ISO/IEC 25024:2015 “Systems and software engineering — Systems and software Quality Requirements and Evaluation (SQuaRE) — Measurement of data quality”; +[7] ISO/IEC 25012:2008 “ Software engineering — Software product Quality Requirements and Evaluation (SQuaRE) — Data quality model” ; +[8] GB/T 38672—2020“Information technology ——Big data——Interface basic requirements(Chinese)”. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. + +IV. CONCLUSION +By analyzing the characteristics of big data software, this paper has formed a set of software quality requirements system +81 +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Authorized licensed use limited to: UNIVERSIDADE FEDERAL DO CEARA. Downloaded on April 11,2024 at 17:30:53 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt b/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt new file mode 100644 index 0000000..b2ba614 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/60-Regulatory_Mechanism_of_Financial_Market_Resource_.txt @@ -0,0 +1,196 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Hindawi +Mobile Information Systems +Volume 2022, Article ID 4339456, 12 pages https://doi.org/10.1155/2022/4339456 +Research Article +Regulatory Mechanism of Financial Market Resource Management Driven by Big Data +Wangsong Xie 1 and Jianjun Cao2 +1Business School, Wuxi Taihu University, Wuxi 214064, Jiangsu, China +2Human Resources Department, Wuxi Taihu University, Wuxi 214064, Jiangsu, China +Correspondence should be addressed to Wangsong Xie; xiewangsong@126.com +Received 15 April 2022; Revised 31 May 2022; Accepted 23 June 2022; Published 30 July 2022 Academic Editor: YangGao +Copyright © 2022 Wangsong Xie and Jianjun Cao. is is an open access article distributed under the Creative Commons AttributionLicense, which permitsunrestricteduse, distribution, andreproductioninanymedium, providedthe originalworkis properly cited. +In order to further understand the current situation of the financialmarket and better supervise the resource management of the financialmarket, combined with big data and cloud computing technology, through the construction of big data cloud platform resource management system and the integration of various technical computing frameworks, we can realize the effective supervision of big data resources in the financial market. Using J2EE technology, this paper analyzes, designs, implements, and tests the investment data management system, analyzes the content of the software engineering subject, and obtains the demand function description of the business. According to the software development process and the actual situation of enterprise investment, this paper expounds the basic requirements of the investment data management business, system architecture requirements, user use case status, and the operation and configurationenvironment of the investment data management system. 
+ This paper analyzes the technical characteristics and operation indicators of the software, and establishes the data flow for the data related to investment data management, such as information statistics, data query, information classification and so on. Finally, the system is verified, operated and tested, and the business use cases and parameters of the system are tested according to the two indicators of software testing. The basic functions of the investment data management realized by the system are correct, the design is reasonable, the operation is stable, the operation response time is short, the operation accuracy is high, and the data access efficiency is good. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +1. Introduction +Today, with the advent of the information network era, data and information are becoming more and more important, especially for all areas of life. The understanding of big data directly affects the development of an enterprise or industry. With the advancement of communication and dataization, the integration of finance and big data industries in the new economic era is crucial. The emergence and continuous improvement of big data can increase the transparency of financial markets. With the help of new technologies such as big data and cloud computing, financial services can discover more important and useable data from big data and enhance this data to promote the health of the financial system. At the same time, big data can support research on Internet business management and financial markets, help +financial markets achieve greater influence, better avoid business risks, and improve the performance of financial service businesses [1]. However, with the continuous increase of financial market resources, especially the fact that more and more idle funds of the public are handed over to financial institutions for asset management, the supervision of financial institutions is becoming more and more important.
Under the dual influence of internal and external regulatory policies and regulators, the financial market urgently needs to strengthen the construction of resource management and supervision mechanism, as shown in Figure 1. Based on this, the article combines big data and cloudChinatechnologytoachievebettermanagementofbig data in the finance industry and maintain multi-inclusive management and integration by creating a big data cloud platform experience. At present, the research and discussion + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +2 +A collection of Portfolio +investment Fund manager investment +Investors a Securities a Investors b Fund Securities b +Investor’s c Fund trustee Securities c Figure 1: Financial market resource management. +mainly focus on restricting the investment of asset man- agement business in nonstandard business. e system re- cently introduced at the regulatory level also reflects the opinions and clear attitude of standardizing nonstandard asset investment [2]. At present, the development trend of the financial industry is mixed operation and financial in- novation. Nonstandard assets have played an important role in activating the financial market, enriching financial in- struments and serving the investment and financing of the real economy. e return to simplicity can only be relative, and the return to simplicity of financial derivatives is completely inconsistent with the reality of development. +2. Literature Review +Huanget al.[3] studied theinvestment system of enterprises and made some achievements in the research process [3]. Sultanaw et al. [4] put forward the theory of “reference design model” for the investment management system in South Korea. e theory adopts a strategic way to sort and manage the investment information, and handles the in- formation security problems in the task of the management system through effectivemeans. 
It forms a unique theory for the actual investment management system [4]; Phi- boonbanakit and Horanont [5] solved the demand analysis of investment management system, improved the quality of system analysis report from the aspect of reliability, com- bined analysts and business personnel, and eliminated some obstacles between them [5]. Qu [6] believed that the essence of the model is based on the “cooperation mechanism.” Process capital analysis can solve existing problems and solve problems in investment management level assessment from the perspective of cooperation and collaboration [6]. Yan et al. [7] said thatthe investment management system is carried out around services, through high-quality services, shaping and strengthening a good public image of invest- ment, creating a favorable public opinion environment, striving for favorable investment policies, and finally real- izing the long-term development of investment manage- ment [7]. Watson et al. [8] believed that the investment management platform, as an important part of digital +Mobile Information Systems +investment, is a scientific management guarantee for real- izing investment, involving all links and multi-level com- prehensive application of investment management. e investment management system with scientificmanagement asthecore,effectivelysupportstheimplementationofdigital enterprises, improves the management efficiency of enter- prise parks, and becomes an irreplaceable platform for in- vestment management of enterprises [8]. Hyers [9] said that for capitalist countries, the main goal of market supervision is simple and clear, that is, to maintain market order by relying on mandatory laws, systems and norms, and its market supervision behavior is controlled by the nature of capitalism. erefore, with the development of capitalist market and the change of government functions, there are various studies on market supervision [9]. For example, Connolly Barker et al. 
[10] believed that market regulation is the comprehensive control of various factors in the market by the government in order to ensure social stability and sustainable economic development, to standardize market behavior, and to ensure orderly operation of the market and maintain stable economic development [10]. Keane et al. [11] said that market regulation is a passive government behavior. Since the market cannot spontaneously maintain good order, the government needs to participate in regu- lation. erefore, market regulation must have mandatory elements. With the continuous development of the market, the market supervision implemented by the government must achieve dynamic follow-up, that is, the government supervision can meet the needs of market development [11]. Guan et al. [12] believed that if the market supervision implemented by the government cannot meet the needs of the current market, it will lead to the lack of supervision in some supervision and many problems; although the gov- ernment’s market supervision comprehensively includes market factors, if the supervision is too frequent, or even the supervision strength exceeds the market bearing capacity, it will restrict the benign self-development of the market to a certain extent [12]. Maddumala et al. [13] said that the characteristic of market supervision is that functional de- partments not only supervise in accordance with relevant lawsandregulations,butalsomanageallaspectsandlinksin the market. Due to the characteristics of socialist economy, the government also supervises its own market behavior to comprehensivelyensurethestabilityandorderofthemarket [13]. 
+Based on this research, this paper proposes a regulatory mechanism based on big-data-driven financial market re- sourcemanagement.Inthispaper,usingtheJ2EEtechnique, analyzed, designed, implemented, and tested the investment data management system, to analyze the content of the software engineering project, get the business requirements function description, based on the software development process, according to the actual situation of enterprise in- vestment, the basic requirements of the investment data management business, the system architecture require- ments, the status of the user use case are expounded. For the operation and configurationenvironment of the investment data management system, the technical characteristics and operation indexes of the software are analyzed, and the data +Mobile Information Systems +related to investment data management, established the data process, such as information statistics, data query, infor- mation classification, and other contents, at last, verify the running and tested the system, according to the two aspects of the software testing indicators, service case and param- eters of the test system. e basic functions of the system are correct, with reasonable design, stable operation, short operation response time, high operation accuracy, and good data access efficiency. e test results show that the in- vestment data management system of the investment en- terprise operates normally, and the various operating parameters of the software meet the design requirements and software engineering standards. +3. Design of Supervision Platform for Financial Market Resource Management +3.1. System Functional Requirements.According to the construction objectives, the basic functions of the invest- ment data management platform are shown in Figure 2 below. 
+(1) Design the enterprise basic information manage- ment module, the main functions are: manage the basic situation of the enterprise, list statistics of subordinate enterprises, and manage the basic business of the enterprise; +(2) Management and investment project information module: manage high-risk financial investment projects, foreign investment projects, and fixedasset investment projects; +(3) e investment summary and analysis module in- cludes enterprise basic information summary, for- eign investment project summary, and fixed asset investment project summary; +(4) Management of investment implementation: quar- terly progress of major projects, annual imple- mentation of projects, annual implementation of fixed asset investment projects, foreign investment projects, and high-risk financial investment; +(5) Statistical risk data, investment risk management module shows the risk of investment projects; +(6) e system login module provides user login. At the same time, only the system administrator can add, modify, and delete business operators. e system administrator can only add from the database [14]. +3.2.SystemUseCaseStatus.Use case diagram is a key factor in the software development engineering. It reflects the relationship between all users and system business functions in a system. e drawing of use case diagram will clearly reflecttheoperationpermissionsofdifferentusers,asshown in Figure 3. 
+ e administrator of the investment data management +system can handle the following businesses in the system: managing investment risk, managing investment project information, managing enterprise information, managing +3 +system data, managing investment execution, user login, investmentsummary,and analysis,etc., einvestmentuser of the investment data management system can handle the following businesses in the system: management of invest- ment risk, management of investment project information, management of enterprise information, management of investment execution, user login, investment summary analysis, and other permissions [15]. +3.3. System Data Flow Requirements +3.3.1. Top Level Data Flow.As shown in Figure 4, the top- level data flow is designed to display the data interaction process and reflecttheinvestmentdata managementsystem. e main business data processed are: investment execution data, project risk basic data, enterprise basic data, invest- ment project data, and user basic data. e data flow fully shows the flow direction of system design. +3.3.2. Query Data Flow. As shown in Figure 5, the data information of the investment data management system for investment enterprises mainly deals with the query data, including project risk data, investment department data, system user data, and investment execution data. rough the query flow chart, the final query flow direction of the investment data is the storage table of the database, which is themainfeatureofaninformationmanagementsystem[16]. +3.3.3. System Login Data Flow.AsshowninFigure6,theuser login process of the investment data management system is established, and the window provided for user login is dis- played on the operation interface. In the test process, input their own login information first. After confirming that the information is input correctly, operate the “login” button below. e interface program will analyze whether the user informationexistsandverifytheiruseridentity. 
etestshows that if the login information is operated correctly, the main interface of the investment data management system will be opened,otherwise,theinterfacewitherrormessagewillappear. +3.4. Overall System Design +3.4.1. Network Structure Design.Since the design should meet the actual needs, the solution of the investment data management system of the investment enterprise should realize the management and analysis of the investment data management information when designing the investment data management system, and the selected network equipment should meet the requirements. is is a relatively advanced model in the industry and is composed of the data network system [17]. e manager manages the data in the database.Forthenetworkproductswidelyusedintheworld, when selecting the products of internationally well-known manufacturers and designing the network equipment of the investment data management system, the principle of safety, stability, and reliability shall be followed to ensure the smooth implementation of investment data management. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +4 Mobile Information Systems +Functional structure of financial investment data management system +Manage +Enterprise basic Investment Investment Investment +Investment System information summary Execution Risk +Project login management analysis Management Management +Information module module module Module Module +Module +Figure 2: Functional structure of financial investment data management system. +data management system User login +System data management +Enterprise Information Management +investment project management +Enterprise administrator investment user +Investment summary analysis +Investment execution +Investment Risk Management +Figure 3: Use case diagram of financial investment data management system. 
+Investment +Investment Execution Investment +Corporate project Information Risk +Information information Information User Info +data exchange +Figure 4: Top level data flow diagram of financial investment data management system. +Mobile Information Systems 5 +Teaching information +Laboratory Information +query +Data query data +processing entry +Personnel information +Instrument and equipment information +Figure 5: Data flow diagram of data information query. +physical enter the input Check Compare perform Complete Enter the system +login system main +Certification databases login +verification page +Figure 6: System security access data flow diagram. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + + e investment data management business data takes the front-endswitchasthebufferlibrary,integratesthedatainto the central database through the data exchange platform, accesses all hosts to the server in the internal LAN, and accesses the system with the external Internet. VPN tech- nology can be used on the Internet. For users without an external network, the data center is deployed on the external network of the enterprise. e resources of the investment data management data center can be accessed safely through theInternetnetwork,andtheusersofthenetworkcanaccess in the same network [18]. e remote control of the client can be realized through the network data exchange. e investment data management system of the investment enterprisecanactivelyinitiatetheconnectiontothenetwork and has the wired communication function between the server and the client. It can obtain the current system status oftheclientandthedataoftheinvestmentdatamanagement businessinrealtime,soastorealize thecontrollabilityofthe whole investment data management information trans- mission process. +3.4.2. System Function Structure Design +(1) First, Software Data Layer. Data layer maintenance is the application-oriented data existing in the system. 
rough the storage medium, the system-related information is stored in a certain medium and saved in a regular way. e +upper end of the system can carry out various effective operations on the information in the database through the program software, so as to achieve the business function, data storage, and data access of the client of the investment management system. Its main core operation is the input and output of data. If these two points are handled well, the business function of a management system can be handled accurately [19]. In the investment data management system studied in this paper, various tables of relevant data are stored in the database environment. e client can call and access the information of enrollment management, plan management, personnel management, and so on. +(2) Second, Software Middle Layer. In the investment data management system of investment enterprises, in addition to the traditional data storage mode, the database access middleware technology is also designed and used. A layer of middlewaresystemisdesignedbetweenthedatabaseandthe logic layer. Its main function is to quickly connect the business layer and the database. rough the connection of this interface, the encapsulated function events will be called when the data is input and output, which reduces the programming of the program end. It also improves the data transmission efficiency and realizes stable high-level appli- cations in the process of communication interaction. It is of great value for maintaining, transplanting, and upgrading the management system in the future expansion [20]. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Mobile Information Systems +9 +3.4.3. Software Presentation Layer.In the business layer, the interface of software client is designed and developed through J2EE technology, and the operation code is pro- grammed. 
According to the design of investment data management module, the management function is designed in detail. According to the business needs, the enterprise network is established: investment summary and analysis module, investment project management module, invest- ment risk management module, investment execution module, data management module, user login module, enterprise information management module, etc., As shown in Figure 7. +3.4.4. Risk Management Module. e design of investment project risk management function is shown in Figure 8. By analyzing the risk data existing in the implementation of the investment project, the risk problems that have been han- dled can be updated and deleted. e system user can add, view, analyze, and process the risk data of the investment project. e function of data binding, display, management, and maintenance of investment project risk realizes the maintenance of the investment risk data. Realizethedataupdate,asshowninFigure9.Executethe update operation, enter new data in it, and update the data through the inputable dialog box after completing the input. According to the security strategy of hierarchical pro- tection and combined with the characteristics of manage- ment business, the community management system should be divided according to the construction of security pro- tection system of each security domain, external network platform domain, and internal network platform domain +[21]. e terminal machine room shall ensure safety and security: fire prevention, anti-theft, dust prevention, wa- terproof, anti-static, and anti-power failure. 
e security system design of the investment data management system followsthesecuritysystemmodel.Undertheguidanceofthe unifiedhierarchical protection security strategy, the security system design of the whole online management platform is divided into several important contents, such as the con- struction of security technology security system, emergency response system, and security management security system. e construction of security technology guarantee system includes security infrastructure (including unified authen- tication, password service system, trusted timestamp service system, etc.,), and security service system (monitoring and detection system, etc.,). e construction of emergency response system includes emergency response objects, processes, institutions, and other aspects. e construction of safety management guarantee system includes organi- zation, system, management means, safety audit, and so on. +4. Key Technologies of Resource Management for Big Data Drive +4.1. Big Data Platform Computing Framework. ere aremany computing frameworks for different scenarios of big data processing, including MapReduce parallel computing model, spark memory computing framework, and some +streaming computing frameworks. MapReduce parallel computing model is mainly used in large-scale batch com- puting scenarios. Due to its poor performance in iterative algorithms, spark memory computing framework appears. Spark memory computing framework greatly improves the performance of data mining and machine learning algo- rithms [22]. e streaming computing framework mainly dealswiththeapplicationscenarioswithstrongreal-timeand interactive requirements. Different computing frameworks havetheirownadvantages.Alarge-scalesystemoftenfacesa variety of application scenarios, and a variety of computing frameworkscanplaytheirrespectiveroles. ispapermainly uses MapReduce parallel computing model. 
Traditional parallel computing models include data parallel model and messageparallelmodel,dataparallelmodelssuchasHPFand message passing models such asMPI and PVM.Whenusing the traditional parallel computing model to write programs, users need to intervene in the division of data and the syn- chronization of tasks and the burden of programmers is heavy. In order to reduce the programming difficulty of parallel processing massive data, MapReduce program can run on a cluster composed of cheap commercial machines because it does not care about the performance of a single node and has high fault tolerance [23]. MapReduce parallel computingmodel shields thedetailedimplementationofthe underlying parallel program. Users only need to use map function and reduce function to define their own business processing logic, which is simple and easy to learn, freeing programmers from the heavy burden of traditional parallel programming model, and greatly promoting the develop- ment of massive data processing and analysis ability. +4.2. Joint Optimization of System Resources +4.2.1. Virtual Machine and Physical Server Model. is paperassumesthatCPprovidesatotalofKdifferenttypesof VMs,wherek∈k:�{1,2,..., K}representsthektypeofVM. Each type of VM is preset with differenttypes and quantities +of resource requirements, such as CPU, memory, and hard disk, and g(k) is used to represent the demand for VM +resources of type k. In addition, this chapter assumes that there are m physical servers in the DC, and the resource capacity of each physical server m∈M:�{1, 2,..., M} is +denoted by c (m). +4.2.2. Virtual Machine Request Model.It is assumed that there are a total of H differenttypes of VM requests arriving, and each request type h∈H corresponds to different types and quantities of VMs. 
At the same time, this chapter as- sumesthatthenumberofdifferenttypesofVMsrequiredby each VM request is randomly distributed and independent of each other, and uses r (l, k) to represent the number of VMs of typekrequired by VM request l. erefore, the total resource requirement of VM request l can be expressed by formula (1): +rl � 􏽘r(l,k)g(k). (1) +k + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Mobile Information Systems + +System front desk +middle layer +System background +Network Public Opinion Database + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Mobile Information Systems + +Figure 7: Overall functional architecture of the system. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Mobile Information Systems +11 +Start +Investment execution +no +Is there a risk +yes +Display risk data +end +Figure 8: Risk management operation process. +4.2.3. Income Model.Usually, the CP will bring certain benefits for each VM request it receives. is chapter as- sumesthatinstantiatingaVMoftype kcanbring p(k)toCP per unit time. Although the CP can actively reject some VMrequests so that there are enough remaining resources to accommodate subsequent VM requests with higher revenue value, rejecting VM requests will still bring certain negative impacts to it, such as affecting its reputation, etc., [24] erefore, this paper introduces a “penalty” mechanism to characterize the indirect loss caused when the CP rejects a VM request, and uses φ (k) to represent the unit time loss caused by the CP rejecting a VM of type k. us, the actual benefitthat CP obtains from VM request l can be expressed by (2) and (3): +R(l) � 􏽘ρ(k)r(l,k)τ(l). (2) +k +means l is accepted +Start +Enter new information +no Is the input data +canonical? +yes +Execute update function +Data Update +end +Figure 9: Risk data update operation flow chart. +R(l) � −􏽘ρ(k)r(l,k)τ(l). (3) +k +means l is rejected. 
+4.2.4. Virtual Machine Request Joint Optimization Decision Making Problem. e core problem of the joint decision optimizationofVMaccesscontrolandresourceallocationis to design a strategy that can evaluate the impact of the current resource allocation decision on the future resource + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Mobile Information Systems +21 +π sl􏼁� π( slsup) ∈A( sl)⎧⎪⎨⎪⎩R sl,π sl􏼁􏼁+ c sl􏽘+1∈SP sl + 1|sl,π sl􏼁􏼁Vπ sl + 1􏼁⎪⎭. (9) +⎫⎪⎬ + e strategy obtained by the above formula is the op- +considered, when any VM request l reaches the DC and the CP adopts the decision, the conditional state transition probability of the system in the case of the next random event can be expressed as three cases by the following formula, as shown in formulas (10)–(12): +timal decision π∗(s1) corresponding to each state. +Any VM request can arrive and any VM request can leave. Since this paper assumes that the decision of any VM request is determined when it arrives, the state of the system will not change at the middle time of two adjacent random +utilization and the potential benefits of CP, so that the comprehensive optimization decision that is the most conducive to improve the long-term benefits of CP can be selected for the currently arrived VM requests. erefore, under the joint optimization strategy, for any VM request that arrives, CP needs to consider whether it needs to be acceptedand how toallocateresources toit afteracceptance, and judge the probability of resource blocking or resource wastebyquantitativelyevaluatingtheimpactofthisdecision on subsequent decision-making. Maximize the benefits of the final decision [25]. + e goal of VP problem is to design an optimal decision function π∗, so as to maximize the expected discounted +revenue (EDR) of CP in a long time, as shown in (6): +maxRπs0 � Eπs0⎧⎨⎩􏽘∞ Rl sl,π sl􏼁􏼁ctP s + 1|s ,a 􏼁� λh + 1 � h,s + 1 � s + a (10) +events. 
erefore, CP nly needs to make corresponding decisions on the VM request when it arrives. us, the state transition probability of the system can be defined as the probability that the next random event is the arrival of VM request or the departure of any deployed VM request under a given system state and its corresponding decision. Since the resource reallocation of deployed VM requests is not +l l l λ sl,al􏼁,pl l l l, +P s + 1|s ,a 􏼁� nh′μh′ + 1 � 0,s + 1 � s + a −ah′ +l l l λ sl,al􏼁l l l l l′ , +,p +(11) +⎭. (6) +⎫⎬ +l�1 + e joint optimal strategy of virtual machine access control and placement can be expressed as (7): +π∗ � argmaxRπs0, π ∈II. (7) + +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt b/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt new file mode 100644 index 0000000..e8b1bc5 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/62-A systematic quality assurance framework for the upgrade of radiation oncology information systems.txt @@ -0,0 +1,188 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Physica Medica 69 (2020) 28–35 +Contents lists available at ScienceDirect +Physica Medica +journal homepage: www.elsevier.com/locate/ejmp +Original paper +A systematic quality assurance framework for the upgrade of radiation +oncology information systems +Baoshe Zhang ⁎, Shifeng Chen, Warren D. D’Souza, ByongYong Yi +Department of Radiation Oncology, University of Maryland School of Medicine, Baltimore, MD 21201, USA A R T I C L E I N F O A B S T R A C T + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Keywords: +Quality assurance +Radiation oncology information system Clinical data integrity and safety Radiation oncology data management Integrated oncology system +In spite of its importance, no systematic and comprehensive quality assurance (QA) program for radiation oncology information systems (ROIS) to verify clinical and treatment data integrity and mitigate against data errors/corruption and/or data loss risks is available. Based on data organization, format and purpose, data in ROISs falls into five different categories: (1) the ROIS relational database and associated files; (2) the ROIS DICOM data stream; (3) treatment machine beam data and machine configuration data; (4) electronic medical record (EMR) documents; and (5) user-generated clinical and treatment reports from the ROIS. For each data category, this framework proposes a corresponding data QA strategy to verify data integrity. This approach verified every bit of data in the ROIS, including billions of data records in the ROIS SQL database, tens of millions of ROIS database-associated files, tens of thousands of DICOM data files for a group of selected patients, almost half a million EMR documents, and tens of thousands of machine configuration files and beam data files.
The framework has been validated through intentional modifications with test patient data. Despite the big data nature of ROIS, the multiprocess and multithread nature of our QA tools enabled the whole ROIS data QA process to be completed within hours without clinical interruptions. The QA framework suggested in this study proved to be robust, efficient and comprehensive without labor-intensive manual checks and has been implemented for our routine ROIS QA and ROIS upgrades.
A clinical ROI system provides treat- ment parameters (such as gantry angle, collimator angle, couch angle, jaw position, multileaf collimator position, monitor units, etc.) to a treatment delivery system (such as linear accelerators) and then records all treatment histories and activities. If any of the treatment parameters is accidentally modi fied in the database during the ROIS upgrade, treatment will deviate from the intended plan, with consequences that could harm patients and/or lessen treatment e ffectiveness. An intensity- modulated radiation treatment/volumetric-modulated arc therapy plan might include thousands of treatment parameters, so that it is almost impossible to check these manually as was done in the past. Despite vigorous software QA by the vendors of ROISs before the release of a new version, it is still the responsibility of clinical physicists and IT group members to check and con firm their own data integrity. As a type of medical device, ROISs deserve a comprehensive QA method like any other equipment in radiation oncology. However, few how-to instruc- tions or recommendations for ROIS QA methods have been published [13]. Therefore, it is crucial to perform a series of QA for checking consistency during a ROI upgrade and the QA procedure should be automatic for a practical reason. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +⁎ Corresponding author. +E-mail address: bzhang4@umm.edu (B. Zhang). +https://doi.org/10.1016/j.ejmp.2019.11.024 +Received 17 March 2019; Received in revised form 8 November 2019; Accepted 26 November 2019 1120-1797/ © 2019 Associazione Italiana di Fisica Medica. Published by Elsevier Ltd. All rights reserved. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +B. Zhang, et al. 
Physica Medica 69 (2020) 28–35 +This article presents a systematic QA framework for veri fication ofROIS information integrity after a signi ficant change happened to ROIS, such as ROIS software or hardware upgrades or data migrations. +2. Methods and materials +This framework mainly focuses on clinical data sources and struc- +tures in ROIS. All data are categorized into five kinds: the ROIS SQL [11] database and its associated files, ROIS DICOM [12] data streams, ROIS machine data files and con figurations, EMR documents, and clinical reports generated from the ROIS. The principle of the QA fra- mework compares these five data sources and data structures between ROIS states. Once data integrity is veri fied, an end-to-end test is per- formed to further check connections and interfaces between the ROIS system and other clinical systems (such as treatment planning systems, treatment control consoles, and hospital information systems). +2.1. ROIS relational database +From time to time, due to performance improvements, security concerns, or bug fixes, a ROIS relational database (see Appendix I for details) system would be upgraded. Sometimes, it involves data mi- gration. Usually, data migration occurs in the following situations but +not limited to: (1) the vender strategically changes partnership with commercial database software companies or simply adopts a new da- tabase server architecture based on performance and features; (2) the vendor simply adopts a new hardware and relocates data from a legacy storage to a new data storage, or from a server to another; (3) the vendor redesigns their database schema and architecture and needs to move data from the legacy databases to the new databases. During ROIS upgrades, possible data risks include implicit data loss and explicit data +loss, data corruption, and corrupted data relationships. 
+In order to verify migrated data in databases, the first step is to compare database schema to figure out how data have been re- structured and migrated from the legacy database to the new database +and how data relationships have changed for example, to identify any added or deleted data columns or tables or any data type change for a +data column. An existing data column may move to a di fferent data table, or a data table or column may be renamed. Moreover, data ag- gregations or data splits may have occurred. Such a database schema change is illustrated in Fig. 1. Here, a new data table C in the new database contains data from tables A and B in the legacy database. This diagram also shows that a data column being moved from the legacy database might end up with a di fferent data column name in the new database. + +Fig. 1. Diagram for database schema change. Data table C is in the new data- base, and data tables A and B are in the legacy database. Data column c1 in data table C contains the same data from data column a1 of data table A, and so on for data columns c2, c3, and c4. + +Fig. 2. Database schema comparison. Here A represents the legacy database, and B represents the new databases. Region (c) represents common data ex- isting in both databases, region (a) represents data removed from B, and region (b) represents new data in B. +According to database schema changes, data comparison between +two states of databases can be implemented by either creating data views or designing complex data comparison statements. In our im- plementation, we used A-B and B-A (A and B are datasets from an SQL query statement for legacy databases and for new databases, re- spectively) to identify di fferences between A and B. In Fig. 
2, region (a) represents the data that exist in the legacy database but not in the new database (A-B); region (b) represents newly created data that never existed in the legacy database (B-A) and region (c) represents data that exist in both the legacy database and the new database (A ∩ B). +It is time-consuming and technically challenging to compare big and complex databases. In order to speed up data comparison, concurrent multi-process or multi-thread techniques should be used to process sectional database. A ROIS system might be composed of several da- tabases. Each database might have hundreds or thousands of data ta- bles. Since database servers support parallel data access, each con- current process or thread can handle a portion of a database. For a big data table, its data comparison can be distributed among multiple processes or threads by carefully splitting the data table into multiple sections. +2.2. ROIS DICOM interface +DICOM is a de facto standard in medical fields, including radiation oncology, for patient data exchange and storage, such as exporting radiation therapy (RT) information (e.g., contours, treatment plans, dose distributions of treatment plans, treatment records and radiation therapy images) to a clinic linear accelerator. A ROIS exchanges patient demographic information and radiation treatment information with other radiation oncology systems through DICOM data streams. Although relational databases are the ultimate patient data storage, the information in these databases must be converted into a DICOM data stream before being sent to other systems, such as sending treatment plans to a treatment delivery system. In addition, the ROIS receives information from other systems through its DICOM interface, then converts and stores the information in its relational databases. +DICOM data streams group information into data sets and use three +different element encoding schemes. 
It has a 2-byte field for the information group specifying the information class (such as patient information), a 2-byte field for the information element specifying a particular datum (such as patient name), and a 2-byte field for the data type (for example, ST indicates that the data type is short text).
The procedure not only checks +to determine whether the ROIS DICOM interface is working properly but also implicitly veri fies data in the ROIS databases. +2.3. Beam data and machine con figurations +When treatment machines, such as clinic linear accelerators, are commissioned, a set of machine model parameters are generated based on clinical measurements. These parameters are used for beam mod- eling, dose calculation, treatment plan validation, etc. Individual sites might have di fferent preferences in machine settings and con figura- tions. To verify machine data and con figurations, our approach is to generate an MD5 hash string for each data file between ROIS states. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +B. Zhang, et al. Physica Medica 69 (2020) 28–35 + +Fig. 4a. Snapshot of a DICOM comparison report. In this instance, all plan parameters and treatment records are identical. + +Fig. 4b. Sample report of DICOM RT-Treatment Record changes. In this instance, treatment records have been changed but the plan parameters are identical. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +31 +B. Zhang, et al. Physica Medica 69 (2020) 28–35 +Then these MD5 hash codes are compared to determine if the machine data files are intact. If machine data changes occur, our approach is to obtain the file format information from the manufacturer to compare data and determine what kinds of changes were made. For example, if machine data are saved in XML, an XML file parser is used to compare changes of critical information. +2.4. ROIS static files and EMR documents +Relational databases usually store big trunks of binary data (such as +images, doses, contours, etc.) as disk files in patient folders. The con- tents of these files are not modi fied frequently during routine practice and are kept intact, as are the contents of EMR documents. 
Because of +the very large numbers of these files with terabytes of disk storage, it is not practical to generate a separate copy of all these files for each ROI state. Our strategy is to generate an MD5 hash string for each such file between ROIS states and then compare paired MD5 hash strings to determine whether any such file has been corrupted or altered. +2.5. User-generated documents in ROIS +User-generated documents are usually template-based and can be +generated from information in the ROIS relational databases, such as +patient appointments during a period of time, radiation treatment his- +tory, a list of patients under a speci fic treatment protocol, etc. These reports use common file formats, such as Microsoft Excel, Word, or PDF, so that they can be viewed by third-party software. Our approach uses +file parsers to retrieve information from these reports and compare +them between ROIS states to make sure that information in these re- +ports is identical and accurate. In our clinic, comparison of these reports is automatically performed by in-house built Excel, Word, or PDF file parsers. +2.6. Mode-up test and end-to-end test +After data integrity testing, a mode-up test and an end-to-end test +are performed following clinical work flow (Fig. 5). Therapists loaded each treatment beam of the plans for under-treatment patients into the treatment machines to con firm whether the plans are deliverable. The end-to-end test uses a phantom patient and follows the treatment pro- cedures from CT simulation scan to treatment delivery. All treatment records, including captured images and treatment history, are checked. During this entire end-to-end test process, data in each step are +carefully veri fied. The end-to-end test will not only check the essential ROIS software functionalities but also help to con firm the connectivity between ROIS and other clinical systems. +3. 
Results +The radiation oncology practice at the University of Maryland Medical System includes five photon sites (a main campus and four community practices) and a proton site; and all sites share a single ARIA (Varian, Palo Alto, California, USA) ROIS. Both of the QAs with our novel method following upgrades from version 11.2 to 11.5 in early +2014 and from version 11.5 to 13.7 with the proton modality in late 2016 showed that this framework is reliable and e ffective. +Both ARIA upgrades and QA were performed over a single weekend. Prior to the upgrades, an XML file describing the SQL database schema changes was generated from both the legacy version and the new ver- +sion of ARIA. Once the clinics closed on a Friday afternoon, the QA program generated MD5 hash string for each database-associated file and each EMC document. Another QA program commanded the ARIA DICOM interface to export treatment plans and treatment records for all under-treatment patients. The pre-upgrade SQL databases of the ARIA ROIS were kept for comparison. Physicists, dosimetrists, and therapists generated clinical reports used for routine practice for later comparison. +A copy of machine con figuration files and beam data files of each treatment machine was kept for later comparison. Together, all of these +tasks were completed in 2 3 h. The ARIA ROIS upgrade was then started by the vendor application specialists. After upgrade, the SQL database comparison software started to compare databases table by table and record by record between the pre- and post-upgrade data- bases guided by the schema change XML file of the database. In parallel, the ARIA DICOM interface was commanded to export treatment plans +and treatment records for the same patients as those prior to the up- grade. A DICOM comparison program paired DICOM files according to DICOM Instance UIDs and then compared detailed information between paired DICOM files. 
An MD5 hash string was generated for each data- base-associated file (such as image file, dose file, contour file, etc) and each EMR document, followed by comparison of corresponding pre-/ post-upgrade MD5 hash strings. Another program parsed machine configuration files between pre- and post-upgrades. Clinical and treat- ment reports with the same criteria were exported from ARIA and compared against their pre-upgrade counterparts. All comparison tasks +were completed on a Saturday. The summary of the comparison results +was presented to the chief physicist or the upgrade QA team lead for + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +B. Zhang, et al. Physica Medica 69 (2020) 28–35 + +Fig. 4c. Sample report of DICOM RT-Plan changes. In this instance, plan parameters have been changed but the treatment records are identical. Here, beam type for all treatment beams was changed from STATIC to DYNAMIC. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +B. Zhang, et al. Physica Medica 69 (2020) 28–35 +review. When doubts were raised, the vendor s application specialists were contacted for consultation. Should any doubt or suspicion not be resolved satisfactorily, the ARIA ROIS would have been rolled back. Once data QA was performed successfully, the vendor s application specialists came on-site to perform acceptance tests in the presence of local physicists and/or IT personnel. On Sunday, representatives from each functional group, including physicists, dosimetrists, therapists, +and physicians, performed the mode-up tests and an end-to-end test. Once these tasks had been successfully completed and documented, the +new ROIS was o fficially released for clinic use. +In order not to compromise any clinical patient data, test patients +are used. All of the modi fications have been detected and it was pos- sible to identify the sources of di fferences using the reports generated from the QA proves. 
For instance, a series of parameters of a beam from +a treatment plan has been modi fied, including monitor unit value, collimator angle, couch angle, jaw field sizes, MLC leaf positions, ap- pointment schedule. These changes will result in exported DICOM RT- +Plan changes (Fig. 4b and Fig. 4c and Fig. 6) and will also result in database changes (Figs. 7 and 8). +The system successfully detected true-positive components which have been intentionally added during the upgrade procedure under a test ROIS environment. The error components were a modi fied delivery plan, an altered treatment history, deletion of an image, addition of an electronic medical record and omission of a patient. During the 2014 upgrade, we veri fied 1,638 data tables with 2.4 billion data records, 1.86 million ARIA database static files, and 43,153 EMR documents. For 222 patients under treatment, 605 pairs of DICOM RT plans and 13,480 pairs of DICOM treatment records retrieved from the ROIS DICOM in- terface were compared. 83 new data tables were identi fied. 74 existing data tables had new data columns added, and 4 data tables from the previous version were removed. Meanwhile, two existing data tables +were consolidated into a data table. Reports for 5,073 patient en- counters over a 2-week period were compared and determined to be identical to those before the upgrade. Contents in 12,237 machine files + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +B. Zhang, et al. Physica Medica 69 (2020) 28–35 + +Fig. 5. Clinical work flow for the end-to-end test with a phantom patient. + +Fig. 6. Sample report of DICOM RT-Plan parameter changes. In this instance, multiple plan parameters have been altered. +were compared, and no di fferences were found between pre- and post- 4. Discussions +upgrade states. It took about 2 h for pre-upgrade preparation and about +8 h for post-upgrade QA. 
During the 2016 upgrade, we verified 1,891 data tables with 4.4 billion data records, as well as 9.45 million ARIA database static files and 493,034 EMR documents. For 351 under-treatment patients, 1,104 pairs of DICOM RT plans and 22,046 pairs of DICOM treatment records were compared. 165 new data tables and 94 amended or deleted tables were identified. Reports for 8,452 patient encounters over a 2-week period were compared and were identical to those before the upgrade. Contents in 26,165 machine configuration files and beam data files were compared, with no differences identified. It took about 3 h for pre-upgrade preparation and about 8 h for post-upgrade QA.

Data migration errors in radiation oncology have been identified as emerging issues by the World Health Organization [13], and ROIS software upgrades or changes have been identified as imposing high risk [10]. The International Atomic Energy Agency Human Health Report No. 7 [14] recommended that quality control be performed after record-and-verify system upgrades. However, the relevant QA tools are far behind emerging technology. Until now, the majority of QA checks in ROISs have been performed via manual checks, such as pre-treatment measurements or spot checks [15]. Because of increasing data quantity and complexity, such manual checks can assess only a tiny fraction of patient data for contemporary ROIS systems with EMR functions. A
Physica Medica 69 (2020) 28–35 +comprehensive and automated QA tool is imperative for maintaining +and verifying patient data integrity in the era of big data. +Clinical implementations of automated QA tools have been reported +for initial chart checks [16 19] . Hadley et al. [20] used an automated tool for veri fication of treatment plan parameters after ROIS upgrade and database migration. The transition from conventional manual checks toward automation of patient data QA is challenging. As ra- diation oncology practices migrate from paper-based medical records to EMRs and the integration of ROIS and hospital information systems advances, information stored in the ROIS has been signi ficantly in- creased, further complicating information relationships. The ROIS now includes all kinds of patient data and related data, such as patient de- mographics, clinic appointment schedules, diagnosis codes, treatment +plan and delivery records, planned and delivered doses, along with clinical notes in the form of text documents. In an integrated oncology environment, none of the information is of less importance than others, and con firmation of integrity is crucial for safe practice. +Although our automated QA tools check every bit of data, thanks to +the utilization of multiprocess and multithread techniques, the entire procedure of database integrity QA and other data QAs were able to be completed within hours without clinical practice interruption. +End-to-end tests following the clinical work flow, from CT simula- tion to treatment delivery, are helpful for detecting any issue related to ROIS interconnectivity with other clinical systems and to assess major +components performances. +Although we only applied this framework to ARIA upgrades, the +framework can be seamlessly applied to other ROISs. 
Also, this framework can be trimmed to cater to routine ROIS QA or a different scenario; for example, only a DICOM QA check is needed if only a DICOM
+35 diff --git a/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt b/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt new file mode 100644 index 0000000..23bb037 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/72-Testing MapReduce program using Induction Method.txt @@ -0,0 +1,158 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +2020 IEEE International Students' Conference on Electrical, Electronics and Computer Science +Testing MapReduce program using Induction Method + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +SCEECS 2020 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. + +Ashish Kumar Rai +Department of Computer Science and Engineering Kamla Nehru Institute of Technology (KNIT), Sultanpur, UP, INDIA +email.ashishrai@gmail.com +Abstract—MapReduce is “divide and conquer” applied paradigm for processing large volume of data to filter out information to solve day to day complex challenges. MapReduce is core of big data applications. The challenging part to test these applications which also represent the characteristic of these applications are variation in data due to different format and sources. In other words, poor quality of input data can deviate system towards failure if not handled properly programmatically for variety of input data. MapReduce program itself based on transformations at different level based on the program logic This paper proposes the testing technique based on the mathematical induction principle and considered as extension or conjunction other testing techniques already in used either based on transformations analysis from input to output as in MRFlow. 
Proposed function testing can be used in business acceptance testing and showcase the correctness of program, further can detect many defects even before shipping bigdata application in live. + Keywords—MapReduce, Data Defects, Induction, MapReduce Testing, MapReduce business acceptance testing. +I. INTRODUCTION +Software testing is the process of finding error or defect in program or finding deviation (if any) in expected behaviour or end result. The purpose of this exercise is to improve the quality of software and reduce related cost of defect fix if encountered in live environment. To test bigdata application individual testing required in each stage from extraction of data, loading data in HFDS, transformation and utilization of data as per business requirement and further representing report or dashboard. To meet envisioned purpose of business application it is equally desirable to perform functional and non-functional testing. MapReduce should be considered as layer of bigdata application where key business rules get implemented. This makes testing of MapReduce as key factor for successful of the bigdata implementation. +Lecture “Big Data Essentials: HDFS, MapReduce and Spark RDD” available on coursera website, suggests performing unit, integration, system and acceptance testing [3]. This paper proposed another approach of functional testing based on mathematical induction principle and help to showcase correctness of MapReduce program. This approach should be considered as harmonizing other method used to perform functional testing of MapReduce application. +As per book Concrete Mathematics, Scientific acceptance of mathematical induction has already discussed in different articles and can be understood with example that we will climb as tall as we like on a stepping stool, by demonstrating that able to climb onto the foot rung (the premise) which from each rung we are able climb up to the following one (the step)[4]. +Dr. A. K. 
Malviya +Department of Computer Science and Engineering Kamla Nehru Institute of Technology (KNIT), Sultanpur, UP, INDIA +anilkumarmalviya@gmail.com +This metaphor helps to utilize mathematical induction to solve by formal verification. +The remaining paper is organized as follows: section2 describe about MapReduce paradigm, techniques, tools used for MapReduce and related work done in this area. Next section 3 proposed techniques presenting in this paper along with mathematical model of Induction method. Section 4 is case study which showcase the example of proposed MapReduce testing technique. Further section is conclusion notes for this paper. +II. BACKGROUND +As per press release on September 11, 2017 Gartner’s Hyper Cycle revealed that big data would achieve mainstream maturity within two to five year. This indicate wider acceptability and future technology in IT as bigdata application to support business need and identify hidden potential opportunities. Big Data shown high level of acceptance and maturity where MapReduce is intrinsic core framework for big data applications [1]. + +Fig. 1. Gartner’s Hyper Cycle +The three Vs - Variety, Volume and Velocity (sometime includes Veracity) - are commonly used to describe different aspects of big data or commonly known as Characteristics of Big Data. Sensors & Devices, Social Media, Enterprise and Internet are contributing exponential growth in data volume. With a rough estimation more than 2 trillion gigabytes of data created daily and need high velocity processing. The data may be structured and unstructured with diversify source such as error log, IoT, data from social networks includes but not limited to image data, recordings, visuals, spreadsheet data, + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +SCEECS 2020 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. 
+ +978-1-7281-4862-5/20/$31.00 ©2020 IEEE +text and many more. To resolve the 3Vs challenges of bigdata, +Hadoop is presented as a solution. As per Wikipedia & +Apache, Hadoop provides framework for distributed storage B. Testing MapReduce +and processing by using MapReduce and can be considered as Coursera lecture “Big Data Essentials: HDFS, MapReduce collection of multiple open source utilities to solve problem and Spark RDD” suggest multiple level testings need to which requires more computation and/or storage. Before be performed for MapReduce application - unit, finding test approach and strategy for bigdata application, one integration, system and acceptance testing [3]. +must understand that big data is not only about data volume. It ￿ Unit Testing – Unit testing for MapReduce program can should be considered more as verification process at each step be done separately for mapper and reducer function and and include functional and non-functional testing. Source level can be run on local node. This includes white box validation to verify correct extracted data loaded in HDFS, texting of code. Different tools available to test mapper Validation of MapReduce to verify business logic validation on or reducer function such as MRUnit [20] and Junit [21]. local node (or single node) and then validating on multiple Apart from mapper and reducer, MR Jobs can be tested nodes with validation of output target data to meet business locally on single JVM. +outcome. This paper proposed first attempt testing MapReduce ￿ Integration Testing – Once unit testing completed for based on mathematical induction and can be considered as part individual mapper and reducer function, integration of extended functional testing which provide further confidence testing should be performed on local machine validating on the correctness of MapReduce program and showcase output of mapper function is getting accepted by transformations are as expected. reducer function. 
Further Reducer should be able to +process data as per design. +A. MapReduce ￿ System Testing – After completion of integration testing, system testing should be performed and more +Define MapReduce is a framework to perform parallel likely on distributed environment, both functional and processing on large data stored in distributed over large number non-function testing should be completed before of machines. Each machine computes data stored locally, handling over application for acceptance testing. which in turn contributes to distribute and parallel processing. Function testing take cares of the business requirement The MapReduce follows the "divide and conquer" principle and validate if application is meeting functional aspects [15] where dividing problem to subproblem can be considered while non-functional testing focus on validation of as Map while collating results from subproblem can be performance aspects and volume capabilities of considered as Reduce. With advancement of Hadoop application. +framework as Hadoop2.0, MapReduce is more focused on data ￿ Acceptance Testing – This level of testing is performed processing while in Hadoop1.0 it was overloaded with cluster just before shipping application in live environment and resources management which is now handled by Yarn [5]. show case the application is working as per agreement +and compliant with business requirement. Most of the +MapReduce consists of two steps: time it should be performed by business users (or mix of +(1) Mapper tester along with business user) and considered as +(2) Reducer consent of acceptance for software application. So, Mapper function processes input data and convert them to MapReduce application should be tested in live like intermediate set of data, generally documented as key- value environment, generally black box testing approach is pair tuple, and further Reducer consume these key-value pair applied for this kind of testing [8]. 
+and combine or process them in smaller set of tuples. +C. Related Work +In logical terms, Map function applied on key value pair and MapReduce programs and their testing have been studied returns list of different key value set while Reduce function with different domain like finance, retail, health, defense consume this output and process them as another collection of [9][10] and found multiple challenges [18]. Most of the Big value for given key. The multiple process of mapper and Data applications are developed on top of the MapReduce reducer run in parallel on different node of Hadoop cluster programs [15] which process variety of data having multiple locally to solve large volume big data problem. sources consisting large volume and should be processed in high velocity. While Camargo and Vergilio studied MapReduce program testing and presented observation in their +paper [16]. + Authors L. Bu and Y. Xiong in their work tried to cover reachability testing in MapReduce program which run in concurrent distributed environment [11]. The paper showcases the design and implementation of a parallel reachability testing approach based on Hadoop MapReduce (PRT) with dynamic loading. +On the other paper, Authors worked on the detection of design fault in MapReduce where test data executed in parallel depends on test input data and test configurations. Authors +Fig. 2. Map Reduce logical workflow + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +SCEECS 2020 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. + +propose MRTest testing based techniques presented in paper to automate detection of configuration and design fault [12]. +With reference to [13], authors propose a testing technique for different infrastructure configurations execution of test cases on various input data to find out infrastructure related issue or environmental issues. 
The testing technique helps to automate validation through test engine and applied on real world example. +Authors propose approach to test security policies for MapReduce [14]. Authors suggest FSM formalization for MapReduce in consideration of security policies specification conforming XACML language. +Chen, Ganapathi, Griffith and Katz studied MapReduce and presented paper with their finding as performance evaluation for MapReduce [17]. +Moran, Riva, and Tuya in paper “MRTree: Functional testing based on MapReduce’s execution behaviour”, showcases the functional testing method for MapReduce program based on tree node navigations depth and breadth coverage to find out potential faults in MapReduce program [19]. + +Fig. 3. Word count program - Reduce function +Moran, Riva, and Tuya in another paper “Testing data transformations in MapReduce programs” discussed approach to test MapReduce program based on data flow and proposed testing technique as MRFlow to analyze transformation in MapReduce program by depicting graph to cover different cases and to reveal defect [22]. For given WordCount program [7], authors presented MRFlow graph based on data flow. + +Fig. 4. MRFlow graph for Reduce function +In paper "Towards Ex Vivo testing of MapReduce applications”, authors suggested "Ex Vivo" context independent test approach to detect faults based live data and run on different environment [23]. On the other hand, in another paper authors systematically searches for bugs in MapReduce program and generates test cases [24]. +The author tries to showcase properties of inductive inference for showing correctness of program and using this for software testing [25]. +III. PROPOSED TESTING TECHNIQUE +From acceptance testing prospective, considering the complexity of MapReduce program, it is hard to test and verify if program is running correctly and application is working as per business requirement. 
Most of the time acceptance testing is done as black box testing with minimal code structure knowledge. To support acceptance testing of applications based on MapReduce program, an approach can be adopted which is influenced by mathematical induction. It suggests that for given domain if it can be proved that application is working fine for base case, data set and incremental data set as expected, application or program is more likely correct and conforms to business requirement. In more simple words, induction proof supports program correctness. Online resource [27] further provides some examples using induction to verify and prove correctness of program. 
+A. Mathematical Induction 
+Finding mathematical results based on mathematical principle to showcase its larger applicability: an assertion A(i) for natural number i can be proved if base or initial case A(1) is true and assuming it is also true for A(n) where n is any other natural number, proving it true for next natural number n+1 implies that A(n+1) is also true. The proof of initial case A(1) is the first step while proof of A(n+1) is called the induction step and n is called the induction parameter. It is basis for inductive definition [26]. The proof can be represented as following steps: 
+1. The base or initial case: proving statement holds for 0 or 1. 
+2. The induction step: with assumption statement holds for n and proving statement holds for n+1. 
+Axiom: P(0/1)&∀x(P(x)⊃P(x+1))⊃∀x P(x). 
+B. Applied Testing Technique 
+So far mathematical induction is used to prove program correctness using formal method or logical inference. Another approach based on induction is inductive testing. But we recommend using the applied understanding of mathematical induction for acceptance testing MapReduce application in combination with black box approach. Since acceptance testing is performed by business user or mix of tester along with business user. 
Following suggested algorithm can be used to test MapReduce application 
+Algorithm 
+Step 1. Run Application for primitive value which is 
+NULL 
+Step 2. Validate that the application is giving correct 
+output with NULL value 
+Step 3. Run Application for primitive value which is 
+Zero 
+Step 4. Validate that the application is giving correct 
+output with Zero value 
+Step 5. Run Application for base value which is minimal 
+data (or data set) 
+Step 6. Validate that the application is giving correct 
+output with minimal data set 
+Step 7. Run the application for given data set X and 
+record the output for further analysis 
+Step 8. Add ΔX (delta) in given data set X 
+Step 9. Run the application for X + ΔX data set 
+Step 10. Compare the output with step 7 
+Step 11. Validate if data is as per the acceptance criteria 
+Step 12. Output in Step 11 is as per the acceptance criteria 
+Step 13. Iterate the program from step 7 for other data sets 
+(variety of data) and validate 
+Step 14. Validate output for other data sets to see 
+correctness of the program 
+CONCLUSION 
+The proposed testing technique is simple but effective to find bugs in MapReduce program without worrying about architectural complexity of underlying framework. It provides confidence for program correctness and validation results for acceptance testing ensuring meeting business functional requirement in live like environment. The MapReduce programs are more prone for defects due to incorrect validation, data type mismatch or following wrong processing for key value pair or exception handling. Even sometimes defects can be due to incorrect business calculations. These defects may cause program failure or may have business impacts. The proposed technique provides test cases for exceptions such as primitive cases along with validating them against business requirement for given data set showcasing program correctness. 
+As future work we plan to apply sampling for variety or voluminous data or finding acceptance index for iteration on data set, further it can be automated with inclusion of machine learning for test coverage and execution. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +SCEECS 2020 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. + +Depending on business requirement or logical inference +base case can be identified which represent minimal data set on +which program run. Step 1 and 3 validate program for NULL REFERENCES +and Zero to provide a fair chance to check negative test [1] Gartner press release https://www.gartner.com/en/newsroom/press- condition if MapReduce program is built considering no input releases/2017-09-11-gartner-hype-cycle-reveals-the-digitalization-of- +or blank data. Since we are doing acceptance testing, output for the-supply-chain +primitive cases for Zero or NULL along with base case can be [2] Weyuker, E. J. ‘Assessing test data adequacy through program validated based on business logic. For other input and output inference’, ACM Transactions on Programming Languages and data business may have defined domain for input and Systems, 5 (4), (1983) , 641-655. +corresponding range values for output. Step 7 recommends [3] Chtotpusr:s/e/wRAww.courseMra.aoprRg/eldecutcuer e/big-datTae-esstisnegn tials/testing-t48UaLecture running application program for given test data set and record [4] Ronald L. Graham, Donald E. Knuth, and Oren Patashnik ‘Review of +results considering it is inline as per business expectation. 
Now Concrete Mathematics: A Foundation for Computer Science, 2nd Step 8 suggests adding a known Δ (delta – small) value in input edition’Pg3 margin (1989) +data set X and validate if output changes are corresponding [5] Hadoop: open-source software for reliable, scalable, distributed input Δ changes in conjugation of output of step 7. Step 11 and computing. http://hadoop.apache.org/. +12 helps in validation of input and output matching with [6] Institutions that are using hadoop for educational or production uses. corresponding domain and range along with meeting business http://wiki.apache.org/hadoop.5. +logic of application. [7] Wordcount 1.0. http://hadoop.apache.org/docs/r2.7.0/hadoop- +mapreduce-client/hadoop-mapreduce-client- +Since MapReduce program usually run on variety of core/MapReduceTutorial.html#Example:_WordCount_v1.0 +volume data step 13 and 14 helps to iterate program for other [8] IEEE draft international standard for software and systems engineering– variety of data. To find how many iterations required sampling software testing–part 4: Test techniques, 2014. +or acceptance index can be identified. This converge [9] Schatz, M. C. Cloudburst: highly sensitive read mapping with acceptance testing objective to find program correctness and mapreduce. Bioinformatics 25, 11 (2009), 1363–1369. +validating application for meeting business requirement. [10] Kocakulak, H., and Temizel, T. T. A hadoop solution for ballistic image analysis and recognition. In High Performance Computing and +Simulation (HPCS), 2011 International Conference on (2011), IEEE, pp. +IV. CASE STUDY 836–842.. +While exploring the applicability of proposed testing [11] L. Bu and Y. Xiong (Eds.): SATE 2018, LNCS 11293, pp. 173–184, +2018. 
techniques, it has been applied on the popular known example of MapReduce program WordCount [7], which is a program written to find the frequency of every word in input text. To test WordCount program at unit level authors Moran, Riva, and Tuya suggested different testing techniques such as MRFlow based on data flow [22]. But approach suggested in this paper is primarily for acceptance testing and was successful in finding bugs such as the given program failing for primitive case NULL where no input file is given. Program is again validated with text file not having any word for another primitive case. Further program is validated for base case where only one word is present in input text file. WordCount program is then run on given text file as step 7 execution and result is recorded. Further given text file is modified by adding known frequency of certain words. Program ran on modified text file as step 9 and output is validated for known frequency changes in added words. 
+[12] Jesús Morán, Antonia Bertolino, Claudio de la Riva and Javier Tuya, "Automatic Testing of Design Faults in MapReduce Applications" in IEEE Transactions on Reliability (2018) pp. 717-732. 
+[13] J. Morán, B. Rivas, C.D.L. Riva, J. Tuya, I. Caballero, M. Serrano, "Configuration/Infrastructure-aware testing of MapReduce programs", Advances in Science, Technology and Engineering Systems Journal, vol. 2, no. 1, (2017) pp. 90-96. 
+[14] Sara Hsaini, Salma Azzouzi and My El Hassan Charaf, "FSM Modeling of Testing Security Policies for MapReduce Frameworks" in IEEE Conference (2019) pp. 1480-1485. 
+[15] Sharma, M., Hasteer, N., Tuli, A., and Bansal, A. Investigating the inclinations of research and practices in hadoop: A systematic review. In Confluence The Next Generation Information Technology Summit (Confluence), 2014 5th International Conference- (2014), IEEE, pp. 227–231. 
+[16] Camargo L. C., and Vergilio S. R. Mapreduce program testing: a 
systematic mapping study. In Chilean Computer Science Society (SCCC), 32nd International Conference of the Computer Science Society (2013). 
+ 
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+SCEECS 2020 
+Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. 
+ 
+[17] Chen, Y., Ganapathi A., Griffith R., and Katz R. The case for evaluating mapreduce performance using workload suites. In Modeling, Analysis & Simulation of Computer and Telecommunication Systems (MASCOTS), 2011 IEEE 19th International 
+[18] Gudipati, M., Rao, S., Mohan, N. D., and Gajja, N. K. Big data: Testing approach to overcome quality challenges. Big Data: Challenges and Opportunities (2013), 65–72. 
+[19] J. Moran, C. de la Riva, and J. Tuya, “MRTree: Functional testing based on MapReduce’s execution behaviour,” in proceedings International Conference Future Internet Things Cloud, 2014, pp. 379–384. 
+[20] Apache MRUnit. [Online]. Available: http://mrunit.apache.org. 
+[21] JUnit. [Online]. Available: http://junit.org. 
+[22] J. Morán, C. de la Riva, and J. Tuya, “Testing data transformations in MapReduce programs,” in Proc. 6th Int. Workshop Automat. Test Case Design, Selection Evaluation, 2015, pp. 20–25. 
+[23] J. Morán, C. de la Riva, and J. Tuya, “Towards Ex Vivo testing of MapReduce applications,” in proceedings. IEEE International Conference on Software Quality, Reliability and Security, 2017, pp. 73–80. 
+[24] Christoph Csallner, Leonidas Fegaras and Chengkai Li. New Ideas Track: Testing MapReduce-Style Programs. Proceedings of the 19th ACM SIGSOFT symposium and the 13th European conference on Foundations of software engineering. Pages 504-507. 
+[25] Zhu, H.: A formal interpretation of software testing as inductive inference. 
Software Testing, Verification and Reliability 6(1) (1996) 3– 31 +[26] Hazewinkel, Michiel, [1994], "Mathematical induction", Encyclopedia of Mathematics, Springer Science+Business Media B.V. / Kluwer Academic Publishers, ISBN 978-1-55608-010-4 ed. (2001) [Online] https://www.encyclopediaofmath.org/index.php/Mathematical_induction +[27] Lecture “Verifying the Correctness of Programs” [Online] http://www.cs.cornell.edu/courses/cs312/2006sp/lectures/lec10.html +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +SCEECS 2020 +Authorized licensed use limited to: University of Exeter. Downloaded on June 19,2020 at 13:26:41 UTC from IEEE Xplore. Restrictions apply. diff --git a/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt b/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt new file mode 100644 index 0000000..28e24e8 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/73-BigFuzz_ Efficient Fuzz Testing for Data Analytics Using Framework Abstraction.txt @@ -0,0 +1,148 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +BigFuzz: Efficient Fuzz Testing for Data Analytics Using +Framework Abstraction +Qian Zhang Jiyuan Wang Muhammad Ali Gulzar +University of California, Los Angeles University of California, Los Angeles Virginia Tech +zhangqian@cs.ucla.edu wangjiyuan@g.ucla.edu gulzar@cs.vt.edu + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Rohan Padhye +Carnegie Mellon University rohanpadhye@cmu.edu +ABSTRACT +As big data analytics become increasingly popular, data-intensive scalable computing (DISC) systems help address the scalability is- sue of handling large data. 
However, automated testing for such data-centric applications is challenging, because data is often incomplete, continuously evolving, and hard to know a priori. Fuzz testing has been proven to be highly effective in other domains such as security; however, it is nontrivial to apply such traditional fuzzing to big data analytics directly for three reasons: (1) the long latency of DISC systems prohibits the applicability of fuzzing: naïve fuzzing would spend 98% of the time in setting up a test environment; (2) conventional branch coverage is unlikely to scale to DISC applications because most binary code comes from the framework implementation such as Apache Spark; and (3) random bit or byte level mutations can hardly generate meaningful data, which fails 
+to reveal real-world application bugs. 
+We propose a novel coverage-guided fuzz testing tool for big data analytics, called BigFuzz. The key essence of our approach 
+is that: (a) we focus on exercising application logic as opposed to increasing framework code coverage by abstracting the DISC framework using specifications. BigFuzz performs automated source to source transformations to construct an equivalent DISC application suitable for fast test generation, and (b) we design schema-aware data mutation operators based on our in-depth study of DISC application error types. BigFuzz speeds up the fuzzing time by 78 to 1477X compared to random fuzzing, improves application code coverage by 20% to 271%, and achieves 33% to 157% improvement in detecting application errors. When compared to the state of the 
+KEYWORDS +fuzz testing, big data analytics, test generation +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). +ASE ’20, September 21–25, 2020, Australia ©2020 Copyright held by the owner/author(s).ACM ISBN 978-1-4503-6768-4/20/09. https://doi.org/10.1145/3324884.3416641 +Miryung Kim +University of California, Los Angeles miryung@cs.ucla.edu +ACM Reference Format: QianZhang,JiyuanWang,MuhammadAliGulzar,RohanPadhye,andMiryung Kim. 2020. BigFuzz: Efficient Fuzz Testing for Data Analytics Using Frame- work Abstraction. In 35th IEEE/ACM International Conference on Automated Software Engineering (ASE ’20), September 21–25, 2020, Virtual Event, Aus- tralia. ACM, New York, NY, USA, 12 pages. https://doi.org/10.1145/3324884. 3416641 +1 INTRODUCTION +Emerging technologies are producing much data and the impor- tanceofdata-centricapplicationscontinuestogrow.Data-intensive scalablecomputing(DISC)systems,suchasGoogle’sMapReduce[30], Apache Hadoop [1], and Apache Spark [2], have shown great promises to address the scalability challenge of big data analytics. Although DISC systems are becoming widely available to industry, DISC applications are difficult to test and debug. Data scientists of- ten test DISC applications in their local environment using sample data only. These applications are thus not tested thoroughly and may not be robust to bugs and failures in the production setting. +The correctness of DISC applications depends on their ability +to handle real-world data; however, data is inherently incomplete, continuously evolving, and hard to know a-prior. 
Motivated by the successes of systematic test generation tools [33,34,62], a few have been proposed for dataflow-based DISC applications [38, 45, 52]. For example, BigTest [38] uses symbolic execution to automati- cally enumerate different path conditions of a DISC application and generate concrete inputs using an SMT solver. However, its applica- bility is limited to the dataflow operators (e.g., map, reduce, join, etc.) where symbolic execution is supported, and limited by the path exploration capability of the underlying symbolic execution engine and an SMT solver. In other words, developing a robust test generation tool for DISC applications remains an open problem. +In recent years, coverage-guided mutation-based fuzz testing has emerged as one of the most effective test generation techniques for large software systems [17, 49]. Such fuzz testing techniques are based on implicit assumptions that it takes a relatively short amount of time to repetitively run programs with different inputs and arbitrary byte level mutations are likely to yield reasonable inputs. In fact, most fuzzing techniques start from a seed input, generate new inputs iteratively by mutating the previous inputs, andaddnewinputstotheinputqueueiftheyexerciseanewbranch. +* This research was done, while the third and fourth authors were graduate students at UCLA and UC Berkeley respectively. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +However, our experience tells us that fuzzing cannot be applied to big data analytics directly. First, the long latency nature of DISC systems prohibits the efficacy of traditional fuzzing. 
While traditional fuzzing techniques assume thousands of invocations per second, for example, Apache Spark applications would need about 10 to 15 seconds to initialize the Spark context for each run—job scheduling, data partitioning, and serialization all contribute to increased latency. Second, low-level mutations (e.g., flipping a bit or byte) in existing naïve fuzzers can hardly explore corner cases that represent realistic application bugs. Lastly, grammar-aware fuzzers [35, 43, 70] exist to reduce the time required for constructing meaningful inputs. However, they require a user to provide grammar rules and, by definition, they do not produce inputs violating the user-provided grammar rules. 
+In this paper, we lay the groundwork for embodying a coverage-guided, mutation-based fuzz testing approach for big data analytics. The key insight behind BigFuzz is that fuzz testing of DISC applications can be made tractable by abstracting framework code and by analyzing application logic in tandem. Our key idea is to perform source-to-source transformation of a DISC application to a semantically equivalent, yet framework-independent program that is more amenable to fuzzing. 
+Based on the insight that a DISC application developer writes application logic in terms of user-defined functions and connects them using dataflow operators in the DISC framework, BigFuzz focuses on exercising application logic as opposed to the DISC framework implementation. BigFuzz uses a two-level instrumentation method to monitor application-specific coverage, while modeling the different outcomes of dataflow operations. As such combination of behavior modeling is independent of the underlying DISC framework implementation, we can abstract the framework with executable specifications and generate a Spark context free program to mitigate the long latency caused by the DISC framework. 
An application developer is not required to write any custom specifications, because the specifications for dataflow operators such as map and reduce do not need to be re-written for each application. BigFuzz fully automates this process of constructing a semantically equivalent DISC application through source to source transformation. 
+As opposed to random bit or byte-level input mutations, we design schema-aware mutation operations guided by real-world error types. These mutation operations increase the chance of creating meaningful inputs that map to real-world errors. To inform the design of such data mutation operators, we conducted a systematic study on common error types and root causes in Apache Spark and Hadoop applications using two complementary sources: Stack Overflow [3] and Github [4]. The study identified ten common error types, which we map and encode in terms of six different mutation operators in BigFuzz. 
+We evaluate BigFuzz on a benchmark of twelve Apache Spark applications. We compare the time to generate test inputs and their associated error-finding capabilities against two baseline techniques: random fuzzing, and symbolic-execution based testing. With framework abstraction, BigFuzz is able to speed up the fuzzing time by 78 to 1477X compared to random fuzzing. Schema-aware mutation operations can improve application code coverage by 20 to 200% with valid inputs as seeds, which leads to 33 to 100% improvement in detecting application errors, when compared to naive random 
+fuzzing. Even without valid input seeds, BigFuzz improves application code coverage by 118 to 271% and error detection by 58 to 157%, demonstrating its robustness. We show that BigFuzz is applicable to twice more applications and can find 81% more bugs than the state of the art, BigTest. 
+In summary, this work makes the following contributions: +(1) We propose a fuzz testing technique called BigFuzz that targets DISC applications by automatically abstracting the dataflow behavior of the DISC framework with executable specifications. This novel approach can also be generalized to other systems with long latency. +(2) We propose an automated instrumentation method to moni- tor application logic in conjunction with how dataflow op- erators are exercised in terms of their dataflow equivalence class coverage. +(3) Wepresentschema-awaremutationoperationsthatareguided by real-world errors encountered in DISC applications. To our knowledge, we are the first to design a fuzz testing tech- nique by empirically studying and codifying mutations that correspond to real-world DISC bugs. +(4) Our experimental evaluation on 12 Apache Spark applica- tions demonstrates that BigFuzz outperforms prior work in terms of code coverage and error-detection capability. +We provide access to artifacts of BigFuzz at https://github.com/ qianzhanghk/BigFuzz. +2 BACKGROUND +Apache Spark. BigFuzz targets Apache Spark, a widely used data intensive scalable computing system but can generalize to other DISC frameworks. Spark achieves scalability by creating Resilient Distributed Datasets (RDDs), an abstraction of distributed collec- tion[73].ProgrammerscantransformRDDsinparallelusingdataflow operations, e.g.,val newRDD = RDD.map(s => s.length).Dataflow operators such as filter, map, and reduce are implemented as higher-order functions that take a user-defined function (UDF) as an input argument. The actual evaluation of an RDD occurs when an action such as count or collect is called. For example, a Spark application developer writes application logic in terms of UDFs and connects them using dataflow APIs. 
To execute the program, Spark first translates a program into a Directed Acyclic Graph (DAG), where vertices represent various operations on the RDDs, and then executes each stage in a topological order. +Thecommonindustrypracticefortestingsuchbigdataanalytics applications remains running them locally on a randomly sampled dataset.Testingwithsampledataisoftenincompletewhichleadsto rare buggy cases in production runs. Often Spark programs run for days and then crash without an obvious reason. Additionally, the start up latency associated with invoking the Spark frameworkand Block Manager Mastercan take several seconds for simply setting up an execution environment and repetitive data partitioning, job scheduling, serialization, and deserialization to support distributed execution all contribute to increased latency. Thus random fuzzing would be prohibitively expensive to test big data analytics. +Fuzz Testing. Fuzz testing such as AFL [17] has been proven to be highly effective in synthesizing test inputs that achieve high code coverage and find bugs. Given an input program, it instruments + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +Figure 1: Approach Overview of BigFuzz + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +1 val loan = sc.textFile("account_history.csv") +2 // Input with zipcode, base loan, years, and rate +3 .map{ line => val cols = line.split(",") 4 (cols(0),cols(1).toFloat, +5 cols(2).toInt,cols(3).toFloat) } +6 //Return zipcode, base loan, years, and rate +7 . 
map{ s => +8 val a = s._2 +9 for(i <- 1 to s._3) +10 a = a * (1 + s._4) +11 (s._1, a) } +12 // Return zipcode and final loan +13 val locations = sc.textFile("zipcode.csv") +14 //input with zipcode and city +. map{ s => +1516➊ val cols = s.split(",") +17 (cols(0), cols(1) } +18 //Return zipcode and city +19 .filter{ s => s._2 == "New York" } +20 val output = loan.join(locations) +21 . map{ s => +22 if(s._2._1 > 10000) ("Property Loan",10000) 23 else if(s._2._1 > 1000) ("Car Loan",1) +24 else ("Credit Debt",1) } +25 //Return three categories based on the loan amount 26 .reduceByKey( _+_ ) +1 ArrayList results0 = LoanSpec.read(inputFile1); +2 ArrayList results1 = LoanSpec.map1 (results0); +3 ArrayList results2 = LoanSpec.map2 (results1); +4 ArrayList results3 = LoanSpec.read(inputFile2); +5 ArrayList results4 = LoanSpec.map3 (results3); +6 ArrayList results5 = LoanSpec.filter1 (results4); ➊ +7 ArrayList results6 = LoanSpec.join1(results5, results2); +8 ArrayList results7 = LoanSpec.map4 (results6) +9 ArrayList results8 = LoanSpec.reduceByKey1 (results7) +(b) A transformed program LoanType.java with executable specifications +1 public ArrayList map3(ArrayList input){ +2 ArrayList output = new ArrayList<>(); ➊ 3 for (String item: input){ +4 output.add( Map3.apply(item) );} +5 return output;} +(c) Specification implementation of map3in LoanTypeSpec.java +1 public class Map3 { +2 static final Map3 apply(String line2) { +3 String cols[]=line2.split(","); +4 return new Map3(cols[0],cols[1]); } + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +(a) A DISC application LoanType.scala (d) The extracted UDF from lines 14 to 16 of Figure 2a is represented as Map3.java +Figure 2: Example code transformation and framework abstraction + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +the program’s bytecode, iteratively generates new inputs by mu- tating several bits or bytes of the seed input, and collects coverage feedback by executing the instrumented program with new inputs. All inputs that exercise a new code branch are then be saved for further mutation. The implicit assumption underlying such itera- tive fuzzing is that the target program can run fast, (i.e., thousands of invocations per second); unfortunately, this assumption is false for many long latency applications such as big data analytics. For example, initializing the Spark context in local model to initiate a distributed data pipeline takes 19 seconds, which correspond to 98% of the total execution time with a typical testing input. The long latency prohibits the applicability of fuzzing for efficient test generation. Besides, naively monitoring branch coverage in DISC applications is unlikely to exercise application logic adequately, since most binary code comes from the DISC framework imple- mentation (e.g., roughly 700 KLOC for Apache Spark). Under this circumstance, naive attempt to increase code coverage may eventu- ally run out of memory. Furthermore, random byte-level mutations can hardly generate meaningful structured or semi-structured data to explore application logic effectively. +3 APPROACH +BigFuzz contains three components that work in concert to make coverage-guided fuzz testing tractable for big data analytics. 
Figure 1 shows (A) abstraction of dataflow implementation using source-to-source transformation with extracted user-defined functions, discussed in Section 3.1, (B) two-level instrumentation for coverage monitoring, discussed in Section 3.2, and (C) input mutations geared towards big data analytic errors based on our empirical study, discussed in Section 3.3. This approach is based on the insight that (1) we can reduce long latency of DISC applications by abstracting dataflow implementation in a DISC framework using executable specifications and (2) we can focus on exercising application logic rather than the entire framework by monitoring code coverage of user-defined functions in tandem with equivalence classes of abstracted dataflow behavior. Although BigFuzz is designed for Spark programs, its key idea can generalize to other DISC frameworks such as Hadoop by rewriting the dataflow operator APIs to our current set of corresponding specification implementations.
+3.1 Framework Abstraction for Fuzzing
+As discussed in Section 2, DISC applications have high latency, making them not suitable for traditional fuzz testing because they
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia
+Table 1: Dataflow Operator and Corresponding Equivalence Classes
+
+Spark Dataflow Operator Transformed Operator Equivalence Classes def filter(udf:T→ Boolean): RDD[T]
+Return an RDD that satisfies a predicate udf:T→Boolean ArrayList filter (ArrayList Input)
+Return an ArrayList of elements passing udf where udf:T → Boolean is implemented in filter F1: Non-Terminating: ∃t.udf (t) = true F2: Terminating: ∃t.udf (t) = false def join[W](other: RDD[(K,W)]):Rdd[(K,(V,W))] Return an RDD containing all pairs of elements with matching keys in this and other RDDs.
ArrayList join (ArrayList L, ArrayList R) Return an ArrayList of elements from left ArrayList tL ∈L and right ArrayList tR ∈R, with matching keys tL,key = tR,key J1: Non-Terminating: ∃tL,tR.tL,key = tR,key J2: Terminating: ∃tL,∀tR.tL,key! = tR,key J3: Terminating: ∃tR,∀tL.tR,key! = tL,key def map[U](udf:T→U) +Return a new RDD by applying udf:T→ U t of this RDD. ArrayList map (ArrayList Input) +Return a new ArrayList by applying a udf:T→ Uto this ArrayList where udf:T→ Uis implemented in map. M: Non-Terminating: always non-terminated def reduceByKey(udf:(V,V) → V) : RDD [K,V] Merge the values for each key using an associative reduce function. ArrayList reduceByKey (ArrayList Input) Merge the values for each key using udf:(V,V) → V where udf:(V,V) → Vis implemented in reduceByKey R: Non-Terminating: always non-terminated + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +spendseveralsecondsjusttoinitializeSpark’sexecutioncontextfor each run. Theoretically, the long start-up latency can be somewhat reduced by sharing one Spark execution environment for multiple runs;however,suchpracticeisstillnotenoughtoachievemillionsof executions per minute, because each run still needs to pass through +a data partitioner, a query optimizer, a job scheduler, and a data serializer/deserializer, etc. +In DISC frameworks, the implementation of dataflow and rela- tional operators is influenced by and universally agreed upon the semantics of such operators [68]. For example, although a dataflow operator join may have a specialized physical implementation in each framework (e.g., hash join), it has the same consistent logical semantics across all DISC frameworks. 
BigFuzz takes advantage of this observation, rewrites a DISC application into an equivalent applicationthatusesdataflowspecifications,andmonitorsdifferent equivalence class coverage of dataflow operations. For example, filter has two equivalence classes—one passing the filter predi- cate and the other not passing the filter. Because dataflow operators are deterministic and state-less [72], the transformed program is guaranteed to be equivalent to the original program. For example, map{x => (x,1)} will always give the same output for the same input for both the spec-based program and the original program. +We map each dataflow operator’s implementation to a corre- sponding simplified yet semantically-equivalent implementation, which we call executable specifications. Such specifications help eliminate the dependency on the framework’s code, transforming +a DISC application into an equivalent, simplified Java program that can be invoked numerous times in a fuzzing loop. +BigFuzz automates this process of rewriting in two steps: (1) UDF extraction and (2) source to source transformation. Figure 2 illus- tratesthisprocessusinganexampleDISCapplicationthatidentifies thefrequencyofeachloantypewithinametropolitanarea.Thispro- gram is a variation of one of the DISC Benchmark [38]. We formu- lateadistributed,RDD-basedimplementationusingSpark’sAPIs(➊ in Figure 2a) to a simplified, executable specification of mapin Fig- ure 2c. Table 1 shows a few sample mappings between Spark RDD’s dataflow implementation APIs, equivalent spec-implementations using ArrayList, and a set of corresponding equivalence classes for each dataflow operator. +Step 1. User-Defined Function (UDF) Extraction. To re-write a DISC application to use executable specifications only, BigFuzz de- composes the application into two components: (1) a direct acyclic graph (DAG) of dataflow operators and (2) a list of corresponding UDFs. 
Internally, BigFuzz decompiles the bytecode of the original +application into Java source code and traverses Abstract Syntax Tree(AST)tosearchforamethodinvocationcorrespondingtoeach dataflow operator. The input arguments of such method invoca- tions represent the UDFs, which are stored as separate Java classes as shown in Figure 2d. +Step2.SourcetoSourceTransformation. BigFuzz usestheDAG extracted in the previous step to reconstruct the DISC application in the same, interconnected dataflow order using executable specifi- cations. Such dataflow spec implementation takes in an ArrayList object as input, applies the corresponding UDF on each element of the input list, and returns an output ArrayList. For example, class LoanSpec.map3 (➊ in Figure 2b) represents the equivalent spec implementation using ArrayList that corresponds to map +• in Figure 2a. It takes in results3 from its upstream opera- tors and returns an ArrayList result4 for downstream operator, LoanSpec.filter1. BigFuzz selects the corresponding UDFs from +the list of UDFs extracted from step 1 and weaves them with the equivalent specifications shown in column 2 of Table 1. For exam- ple, Java classMap3has method apply mapping to the original UDF +• in Figure 2a, and this method is invoked on each element of the input list as seen in Figure 2c. +The above rewriting from a Spark application in Scala or Java to an equivalent Java application reduces the latency of running a DISC application, while retaining the same semantics. It also makes it easier to collect guidance metrics such as branch coverage by leveraging existing tools JQF [55], which takes Java bytecode as input and collects various guidance metrics for fuzz testing. +3.2 Application Specific Coverage Guidance +Priorworkfindsthatbranchcoverageisaneffectiveguidancemech- anism for feedback-guided fuzz testing, pushing test generation towards hard-to-reach corners [17, 44, 56]. 
Generally, feedback- guided fuzzing techniques instrument a program’s bytecode to label each constituent branch and if an input exercises a previously- unseen branch of the program, this input is appended in an input queue and the branch coverage is fed back into the fuzzer. +However, we observe that such branch coverage guidance mech- anism cannot be applied to fuzz testing of big data analytics for two reasons. First, it cannot differentiate user-defined functions from framework code and can thus push test generation naively toward exploring the internals of DISC framework, as opposed to applica- tion logic. Second, it cannot effectively monitor different equiva- lence classes of dataflow operators though prior studies [38,45,52] argue that numerous errors originate from untested equivalence + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +BigFuzz: Efficient Fuzz Testing for Data Analytics Using Framework Abstraction ASE ’20, September 21–25, 2020, Australia +Table 2: Data Collection for Error Type Study. +and thus individual data records stop at this filter. BigFuzz in- struments “TraceLogger.get().emit(new FilterEvent(arm))” in specification implementation of filter to emit FilterEvent with a specific arm to the trace logger. In this way, BigFuzz retains the DISC framework’s behavior on the original application code, while abstracting its coverage guidance mechanism to the level of equivalence classes for individual dataflow operator uses. Coverage Guidance for User-Defined Function. DISC applica- tiondeveloperwritesapplicationlogicintermsofuser-definedfunc- tions (UDFs) and connects them using dataflow operators. These UDFs are standard library based Scala or Java implementations. To restrict normal coverage guidance to the body of UDFs (e.g., Figure2d),BigFuzz usesaselectiveinstrumentationschemeinASM, while ignoring all other dependent libraries. 
This combination of monitoring dataflow equivalence coverage together with control flow events in the body of UDFs constitutes the joint dataflow and user-defined function path coverage (JDU path coverage), which essentially represents the behavior of application logic. + Keyword Total Inspected StackOverflow-Spark apache spark exception 2430 top 150 apache spark error 3780 top 200 apache spark wrong/ unexpected/inconsistent result/output 143 143 StackOverflow-Hadoop hadoop exceptions 2567 top 100 hadoop error 9585 This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt b/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt new file mode 100644 index 0000000..4b9e412 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/74-Failure_Mode_Effect_Analysis_and_another_Methodolo.txt @@ -0,0 +1,108 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Annals of Emerging Technologies in Computing (AETiC) +Vol. 4, No. 3, 2020 +Research Article +Failure Mode & Effect Analysis and another Methodology for Improving Data Veracity and Validity +Ana Elsa Hinojosa Herrera*, Chris Walshaw and Chris Bailey +School of Computing & Mathematical Sciences, University of Greenwich, UK +aehinojosa@ieee.org; C.Walshaw@greenwich.ac.uk; C.Bailey@greenwich.ac.uk *Correspondence: aehinojosa@ieee.org +Received: 29th April 2020; Accepted: 1st June 2020; Published: 1st July 2020 +Abstract: Failure Mode & Effect Analysis (FMEA) is a method that has been used to improve reliability of products, processes, designs, and software for different applications. 
In this paper we extend its usage for data veracity and validity improvement in the context of big data analysis and discuss its application in an electronics manufacturing test procedure which consists of a sequence of tests. Finally, we describe another methodology, developed as a result of the DVV-FMEA application which is aimed at improving the tests' repeatability and failure detection capabilities as well as monitoring their reliability. +Keywords: Big Data; Data Veracity; Data Validity; FMEA; Statistics; Electronics Manufacturing; Quality Assurance; Test Limits Optimisation +1. Introduction +The market of data analytics was valued at USD 904.65 million in 2019 and is expected to reach USD 4.55 billion by 2025 [1]. Moreover, the use of data driven techniques is popular in smart manufacturing. Cost reduction can be achieved by mining data for predicting the quality of a batch, improving robustness of processes, or by reducing the process cycle time, for example. +With regards the definition of big data, the authors in [2] describe it using 1C for complexity and 11Vs for: Volume, Velocity, Variety, Volatility, Virtual, Visibility, Vendee, Vase, Value, Veracity, and Validity. In this paper we cover the last 2 Vs of the list. +Failure Mode and Effect Analysis (FMEA) is a method that has been used to improve reliability, testability and safety of hardware designs, processes, products, and software, for example [3-6]. In electronics, hardware (HW) FMEA has been used to improve electronics reliability [4], and in [7] software (SW) FMEA was used to validate embedded real time systems. +In this paper we extend the usage of the FMEA method to improve data veracity and validity. The proposed extension (DVV-FMEA) is illustrated with an electronics manufacturing application for quality assurance. From using DVV-FMEA in this application a novel methodology was motivated for evaluating, improving and monitoring the definition of production tests. 
+This article is organized as follows. Section 2 introduces the data veracity and validity concepts and main causes that commonly affect data quality. Section 3 discusses the usage of FMEA for data improvement and its application in production testing data. Sections 4 and 5 present the methodology for test definition evaluation, improvement, and monitoring, in addition to its application in a production test dataset, respectively. And finally, Section 6 concludes the article and states future work. +Ana Elsa Hinojosa Herrera, Chris Walshaw and Chris Bailey, “Failure Mode & Effect Analysis and another Methodology for Improving Data Veracity and Validity”, Annals of Emerging Technologies in Computing (AETiC), Print ISSN: 2516-0281, Online ISSN: 2516-029X, pp. 9-16, Vol. 4, No. 3, 1st July 2020, Published by International Association of Educators and Researchers (IAER), DOI: 10.33166/AETiC.2020.03.002, Available: http://aetic.theiaer.org/archive/v4/v4n3/p2.html. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +AETiC 2020, Vol. 4, No. 3 15 +2. Data Veracity and Validity +Poor data veracity and validity improvement is relevant for big data applications, because low quality data could generate inaccurate models and unreliable information, resulting in incorrect data- driven decision taking. In this section we discuss the characteristics of data veracity and validity. +2.1. Data Veracity +Data veracity is the ability to understand the data and the analytical process applied to a dataset. It covers aspects related to confidence in the dataset or data source, for example data integrity, availability, completeness, consistency, and accuracy and in addition, transparency and clarity in the processes used to generate, improve and analyse the dataset [2, 8, 9]. 
Authors in [10] discuss a general list of causes that frequently affect data veracity: +· Measurement system limits: For example, equipment calibration, human errors, and non- standard measurement processes. +· Limits of features extraction: This could be evaluated by measuring the precision of correctness and completeness. +· Data integration limits: In real applications it is useful to gather and combine information from different sources, but sometimes it is challenging due to the diversity of data sources or formats. +· Data ambiguity and uncertainty: In addition to the uncertainty due to data integration there are other sources of data ambiguity, for example ambiguities of natural language, uncertainty related to the information source and low relevance of the information with respect to other available information [11]. +· Data falsification and source collusion: In [12] authors model data falsification attack as a constrained optimization problem with two parameters: efficacy and covertness of the attack. The first parameter is related to the degradation in the detection performance, and the second one is the probability that the attacker will not be detected. In the formulation, the attacker would maximize the attack efficacy while controlling its exposure to the defence mechanism. +2.2. Data Validity +Data validity refers to data worthiness, which may change over time and during the process under study. For example, data generated before relevant changes in the process is not valid to generate models of the current state [2]. +The authors in [13] discussed data staleness for information systems where data is frequently updated. This data freshness characteristic is relevant, for example, in data streaming applications where information quickly becomes obsolete. +3. Data Veracity and Validity Failure Mode and Effect Analysis +In Section 2 we discussed the importance of veracity and validity. 
In addition, we noted its impact on data-based decision-making success. In this section we are going to present the DVV- FMEA steps to follow for improving these two elements of the big data definition, and the results of its usage in an electronics manufacturing quality assurance application. +3.1. Steps of DVV-FMEA +The DVV-FMEA is like HW FMEA, although with differences in System Identification, List of Failure Mode, Causes Identification, and Effect Analysis steps. The details as follows: +Step 1. System Identification: In data-driven analysis, it is common that the modules identified in the process before using datasets for analysis consist of data generation, data storage, data gathering, and data pre-processing. Nevertheless, in some applications where data is streaming the storage module could be different. +As in SW FMEA, the variables or features in the dataset must be listed for its evaluation. When working on big datasets which comprise a big quantity of variables, it seems sensible to group them based on engineering feature or data processes similarities. +Step 2. List of Failure Modes Generation: It make sense to split the meeting time into the different modules and generate a failure modes list for each of these. The brain-storming meeting(s) should include team members with know-how and expertise in the data process and application. +Step 3. Causes Identification: List the causes of failure modes and score them by its occurrence. We recommend including causes related to measurement system limits, features extraction limits, data integration limits, data ambiguity and uncertainty, data falsification and source collusion, data staleness. Ishikawa diagram is a useful tool which could be used as a guidance for causes identification. In Fig. 1 is the version we propose for causes identification in DVV-FMEA. It could be used for each failure mode identified in Step 2. + +Figure 1. Ishikawa Diagram for DVV Failure Modes Causes +Step 4. 
Effect Analysis: In this step the effects of the failures are listed, and each of the effects is scored by its severity. It makes sense to include impacts to confidence in the dataset or data source, data integrity, data availability, data completeness, data consistency, data model or analysis accuracy, execution time or efficiency, ability to replicate results or analysis, and data worthiness.
+As a guidance during the meeting, the DVV-FMEA leader could ask if and how each of the impacts listed above affects the failure mode and fill it in the DVV-FMEA table.
+The following steps are the same as in HW FMEA.
+Step 5. Detection mechanism identification: A list with the available mechanisms that help detect the failure modes is generated. Each failure mode should have a score of its detectability.
+Step 6. Failure mode prioritization: In order to improve the efficiency of this method, the list of failure modes should be filtered based on the Risk Priority Number (RPN), which is calculated as in:
+Equation 1. Risk Priority Number
+$RPN = Occurrence \times Severity \times Detection$
+Step 7. Process or Product Improvement: Based on the prioritization and resources available, the next step is to generate and execute an improvement plan, which contains actions to improve the data veracity and validity. These changes should reduce the score of severity, occurrence, or detection. It seems likely that the severity score is less frequently reduced.
+3.2. Severity, Occurrence, and Detection Scales
+For the scaling it makes sense to use simple scales for severity, occurrence, and detection scores. For example, a 5-level measure such as the Likert scale, which is easy to use. Table 1 details the ranking scale we recommend. Whenever historical data or a previous DVV-FMEA is available, it could be used to quantify the severity, likelihood, or detectability rates.
+Table 1.
Occurrence, Severity, and Detection Ranking Scale +Ranking Occurrence Severity Detection 1 No known failures Very low or none Almost certain detection 3 Isolated failures Low or minor Remote chance of detection 5 Occasional failures Moderate or significant Moderate chance of detection 7 High rate of failure High High chance of detection 10 Failure is almost inevitable Very high or catastrophic Cannot be detected 3.3. DVV-FMEA Application in Production Testing +In this subsection we include DVV-FMEA usage to establish the pre-processing step of the data analysis of an electronics manufacturing application. Experts in the manufacturing and data processes were part of the team that generated the DVV-FMEA table. +In this application the input variables are the result of individual tests in a sequence that runs in a stop-on-fail scenario. For some tests in the sequence, a feature is measured and then compared to upper, lower or both limits to classify faulty devices. More details of the application and intermediate steps of the DVV-FMEA can be found in [14]. +As a result of using the DVV-FMEA, and based on the RPN, the list of +60 failure modes related to data validity and veracity was reduced to 14. Some of them are included in Table 2. Most of the improvements comprise R scripts that pre-process data before its usage for analysis. The scripts detect incorrect data and eliminate it, correct formats, and standardize data pre-processing steps to ensure repeatability, consistency, efficiency, and confidence. +Table 2. 
DVV-FMEA for an Electronic Manufacturing Application +System Module Input Failure Mode RPN Data Generation Overall result The overall result is not consistent 490 Data Generation Text File The file format is not correct 100 Data Generation Test: 90, 480 The test was unsuccessful to detect faulty devices 150 Data Generation Test type Different to test sequence ‘p’ 50 Data Generation Dataset Data does not represent the current process conditions 250 Data Pre-processing Data order The data is not ordered by date-time 70 Data Pre-processing Clean dataset No clarity on how the data was processed before using it for analysis 49 Data Pre-processing Test/Training datasets The sampling is not repeatable 70 The failure mode that has the highest priority is that the overall test result is not consistent, impacting the effectiveness of the test but also its efficiency because extra analysis is performed to ensure the good quality of the devices. The definition of the limits is relevant not only to the accuracy of the tests and the overall result, but also to its efficiency, because in the application one faulty characteristic of the device could be detected by more than one test in the sequence, but the earlier the fault is detected, the shorter the length of the test procedure. In Section 4 we present a methodology proposed to improve the definition of the tests. It was automated using a Python script implemented in a Jupiter notebook. +Another failure mode with high priority is to avoid using out-of-date data for data analysis because the model would not be useful for the current state. This failure mode is relevant because in real applications it is very common that the processes change over time, for instance using new raw materials, updates to the design, or improvements to the manufacturing procedures. The methodology in Section 4 includes a monitoring phase which could be used for data analytics reliability as well. +4. 
Test Limits Evaluation, Improvement and Monitoring Methodology
+The test limits evaluation and improvement process we propose consists of four main phases: Test Efficiency Evaluation, Test Utility to Improve another Test Evaluation, Re-Define Test Limits, and Limits Monitoring.
+4.1. Phase 1: Test Efficiency Evaluation
+In this phase the aim is to evaluate each test in the sequence, comparing the data distribution versus test limits for FS-PTx, PS, and FTx samples.
+Step 1. Select a Test_x in the Sequence: The earlier in the sequence the better, because potentially there is more improvement when finding a fail early in the sequence.
+Step 2. Split the Dataset into FS-PTx, PS, FTx: Here FS-PTx contains data of assets that failed the test sequence but in a test different to Test_x, PS contains the data of assets that passed the test sequence, and FTx is the data of assets that failed Test_x.
+Step 3. Plot Histograms for FS-PTx, PS, FTx: In the histograms it can be visualised how each of these datasets performs versus the Test_x limits, whether there is a partition between the three datasets, and whether the datasets correspond to the same distribution.
+Step 4. Calculate Statistics for FS-PTx, PS, FTx: Descriptive statistics are useful for understanding the datasets. It makes sense to include mean, standard deviation, quartiles, maximum and minimum.
+Step 5. Partition Evaluation: Quantify the distance between the PS and FTx populations. We propose using the following formulas:
+Equation 2. Partition Evaluation around Lower Limit
+$\max(FTx_{\text{below ll}}) + 2 \cdot \mathrm{std}(PS_{\text{between 0.15 and 0.85 quartiles}}) < Tx_{\text{lower limit}}$
+Equation 3. Partition Evaluation around Upper Limit
+$\min(FTx_{\text{above ul}}) - 2 \cdot \mathrm{std}(PS_{\text{between 0.15 and 0.85 quartiles}}) > Tx_{\text{upper limit}}$
+Where $FTx_{\text{below ll}}$ = {y in FTx | y < Tx lower limit}, $FTx_{\text{above ul}}$ = {y in FTx | y > Tx upper limit}, and $PS_{\text{between 0.15 and 0.85 quartiles}}$ = {y in PS | y > PS quartile 15% & y < PS quartile 85%}.
+Step 6. Is there a Partition Between PS and FS-PTx?
Using results of Steps 3 to 5 of this phase, when the answer is positive, the recommendation is to add or update the limits for Test_x. +Step 7. Are PS & FTx Clearly Separated? Using results of Steps 3 to 5 of this phase, when the answer is negative, the recommendation is to reconsider the limits for Test_x. +Step 8. Is FTx Empty? If the data of FS-PTx, PS, FTx are a representative sample, it can be inferred that it is highly probable that Test_x is passed, as a result could be eliminated from the sequence, or reduced the frequency of its execution. +4.2. Phase 2: Test Utility to Improve another Test Evaluation +In this phase the aim is to identify relationships between tests and whether one test could be used to calculate the result of another one. The steps are as follows: +Step 1. Select Test_y in the sequence: Here Test_y is another test in the sequence which is executed after Test_x. +Step 2. Are both continuous variables? If Test_x and Test_y measurements are continuous values, calculate Pearson Correlation Coefficient to quantify its association. If the coefficient is > 0.9 or < -0.9 the conclusion is that both tests are highly associated. +Step 3. Are both discrete variables? If Test_x and Test_y measurements are discrete values, execute a Chi-Square Test to quantify their association. If the p-value is < 0.05 the conclusion is that both tests are highly associated. When the test sequence is run on stop-to-fail scenario, this test cannot be performed, since the dataset contains “pass” and “fail” data for Test_y but only “pass” for Test_x. +When associated Tests are found in Steps 2 and 3, sometimes the association between them could be used to estimate the value of Test_y instead of performing the reading. As a result, the test sequence potentially could be reduced. +4.3. 
Phase 3: Re-Define a Test Limit +In this phase, the results of previous phases are summarised and joined after solving possible conflicts, followed by the implementation and documentation of changes. The details as follows: +Step 1. Improvements Summary: Summarise the recommendations from Phase 1 and 2. +Step 2. Feasibility Evaluation: Evaluate if the new test limits are correct from customer and engineering point of view. +Step 3. Conflict Evaluation: Also evaluate if the recommendations are not in conflict, otherwise evaluate which is the recommendation that generates more improvement. +Step 4. Update Test Limits Definition: The automated test sequence should be updated with the new test limits definition. It is likely that this motivates a new software version, which may need to be certified as part of software quality processes. +Step 5. Document Changes: We recommend that these changes and verifications to be documented on the DVV-FMEA to have all information related to data quality improvement in a single document. +4.4. Phase 4: Limits Monitoring +The objective of this phase is to continuously evaluate whether the new limits are valid, or a re- definition is needed. +Step 1. Metrics Definition: It is relevant to select the most representative metrics to monitor, and it makes sense to choose only a few and to prefer the ones which are easy to measure. +Step 2. Continuous Monitoring: We recommend using statistical process control charts to monitor the key metrics. To keep the manufacturing process as simple as possible, it makes sense to have a small list of key elements to monitor, and also to automate this step, and consider automated flags or warnings when the key elements are not in control. +Step 3. Maintenance: Whenever any of the key monitored parameters are not in control it is time to revisit Phases 1 to 5 of this methodology. +5. 
Test_80 Evaluation and Improvement +In this subsection the methodology we proposed in previous section is illustrated using the Test_80, which is part of the test sequence analysed in the DVV-FMEA we included in Section 3. +In Figure 2 the histograms of assets that passed the test and in Figure 3 the histogram of assets that failed the test. In both figures, the upper and lower limits of Test_80 are indicated in vertical lines. + +Figure 2. Histograms of Assets that Passed Test_80 Figure 3. Histogram of Assets that Failed Test_80 +Table 3. Statistics of Test_80 Samples +Statistics PS FS-PT80 FT80 Count 171131 39846 368 Mean 2.090 2.089 1.694 Std 0.006 0.010 0.432 Min 2.057 1.996 -0.140 25% 2.085 2.085 1.470 50% 2.088 2.089 1.473 75% 2.097 2.096 1.949 Max 2.104 2.104 2.697 From the histograms we can note that FS-PT80, PS and FT80 populations are not clearly separated. They are close around Test_80's lower limit. In addition, most of the assets, which failed Test_80, are near its lower limit. The statistics in Table 3 are in line with this conclusion. Furthermore, the results of the partition evaluation recommend re-defining the Test_80 lower limit. +Following with the methodology, every test in the sequence was evaluated as stated in Phase 2. We found that there is a linear relation between Test_80 and Test_220. Furthermore, all are faulty assets when Test_80 < 2.05 & Test_220 > 2.05. Also, when Test_220 < 1.95 (Fig. 4). +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+www.aetic.theiaer.org diff --git a/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt b/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt new file mode 100644 index 0000000..920dedb --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/76-Software Quality in the Era of Big Data, IoT and Smart Cities.txt @@ -0,0 +1,186 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Chapter 21 +Software Quality in the Era of Big Data, IoT and Smart Cities +Fatmah Yousef Assiri and Rashid Mehmood +21.1 Introduction +Software quality is the degree to which the software conforms to its requirements. General software quality attributes include testability, maintainability, efficiency, and reliability. One important aspect of software quality is software correctness, which concerns how well the program provides the required functionalities, as defined by its specifications, and can be achieved through software testing and debugging. Software testing is a dynamic process that executes the software under study using a set of test inputs to ensure its outputs meet the users’ expectations. If the software behavior fails to perform as expected, software debugging is performed, which involves checking the code to determine the cause of failures and fixing them. +Software testing and debugging are time-consuming. Studies show that soft- ware debugging and testing form between 50 and 70% of the total development cycle [41]. Software testing involves comparing a set of test inputs and expected results to the actual software outputs. If the software outputs fail to match the expected ones, a fault is detected and the software must be checked for errors. Code is debugged to locate faults and fix them. 
As requirements change, the software is tested again to ensure that it continues to return the expected behavior, and additional tests are written to test any new requirements; however, writing new tests is not a trivial process. +F. Y. Assiri ( ) +College of Computer Science and Engineering, University of Jeddah, Jeddah, Saudi Arabia e-mail: fyassiri@uj.edu.sa +R. Mehmood +High Performance Computing Center, King Abdulaziz University, Jeddah, Saudi Arabia e-mail: RMehmood@kau.edu.sa +© Springer Nature Switzerland AG 2020 519 +R. Mehmood et al. (eds.), Smart Infrastructure and Applications, EAI/Springer Innovations in Communication and Computing, https://doi.org/10.1007/978-3-030-13705-2_21 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +21 Software Quality in the Era of Big Data, IoT and Smart Cities 521 +The complexity of software is on the rise with the developments of smart cities. Smart cities are driven by, or involve, integration of multiple city systems, such as transport and healthcare, with the aim to provide its citizens a high quality of life [76], see, e.g., [72] for motivations of smart cities and societies. Integrating multiple complex systems causes an increase in the complexity of the underlying software interactions and leads to a higher software complexity. This in turn makes the software quality a bigger challenge. +Relatedly, big data and Internet of Things (IoT) are driving radical changes in smart cities designs, and hence, the software systems landscape. Big data “refers to the emerging technologies that are designed to extract value from data having four Vs characteristics; volume, variety, velocity and veracity [71].” The Internet of Things (IoT) becomes one of the key technological developments of our times that we are able to realize its full potential; it is expected to be a major producer of big data [5]. 
IoT is defined as “a global infrastructure for the information society, enabling advanced services by interconnecting (physical and virtual) things based on existing and evolving interoperable information and communication technologies [81].” +Together, big data, IoT, smart cities, and other emerging complex applications have exacerbated the challenges of maintaining software quality. The big data produced by IoT and other sources is used in designing or operating various software machines and systems. Since the data is uncertain (i.e., the veracity characteristic), it could lead to inaccurate or faulty system behavior. For example, a computed tomography (CT) scan based on inaccurate machine behavior, or inaccurate data, may give a false positive result for cancer. A wearable device may analyze the data of a diabetic patient incorrectly, giving false negative results, leading to no insulin dose for a patient who actually needed a high dose of insulin. Automatic surgery machines, autonomous vehicles, and spaceships all are examples of critical software with high software and data quality requirements. Moreover, data is being used by organizations to develop strategies, policies, and operations; inaccurate data could lead to disastrous outcomes for these organizations and even for the whole national or global economy. +The aim of this paper is to review the technologies related to software quality in the era of big data, IoT, and smart cities. We elaborate on software quality processes, software testing and debugging. Model checking is discussed with some thoughts on the role it could play in the big data era and the benefits it could gain from big data. The role of big data in software quality is explored. Conclusion is drawn to suggest future directions. +The remainder of the paper is structured as follows. Section 21.2 discusses software quality, software testing and debugging. Section 21.3 discusses model checking. 
Section 21.4 introduces big data and reviews some related work. Sec- tion 21.5 presents a review of the work that applies data mining techniques to utilize available data to improve software quality. Section 21.6 concludes the paper. +21.2 Software Quality +Software quality is the degree to which the software conforms to a set of require- ments that meet the design specification and the users’ expectations. Quality can be viewed and evaluated from the aspects of function, structure, and process [26]. Functional quality concerns the conformance of the tasks to the users’ required functionalities, with few defects as possible. Structural quality relates to the quality of the written code and can be measured by code maintainability, testability, and understandability. Process quality relates to the development process such as meeting the delivery deadlines and budgets. These three aspects of software quality interleave and thus affect each other. +Software testing and debugging are among the main activities in the development cycle that guarantee the quality of the developed software. Software testing is a validation process that is conducted to ensure that the software meets its specifications, and software debugging is the process of analyzing the code to locate errors that caused the software to fail and correcting them [41]. In Sects. 21.2.1 and 21.2.2, we explain the work that has been done in both areas. +21.2.1 Software Testing +Testing, which is among the main steps in the software development life cycle to ensure software quality, involves executing a set of input values and checking their outputs to validate that the software meets its requirements and intended usage[10]. Testing is a dynamic process performed by observing the software execution. If the resulting output differs from the expected results, a fault is detected. The process of finding these faults and correcting them is called debugging. 
+Testing can be done at different levels depending on the phase that has been performed. Unit testing evaluates the software at the implementation phase and tests each unit separately. Units can be an individual element of the software such as a method or a class. System and integration testing are performed when the system is complete. System testing verifies that the whole system meets the design specifications, and integration testing checks that the subsystems (group of units) integrate correctly. +Software testing is divided into black-box and white-box testing. Black-box test- ing examines the application functionalities without looking to internal structures. Black-box testing creates tests from the software requirements and specifications; one form of applying it is through the equivalence class partitioning in which the program behaves the same for each set of input values; each set is called a class. For example, the program should retain the same output values for all positive number, thus the set of positive number is considered a class, and the program should be tested with exactly one value of each class. +White-box testing (also known as structural testing) is a method of testing software functionalities (internal structure), and it can be applied through unit and system testing. Tests performed by the software development team are called alpha testing, and those performed by the customer are called beta testing. Beta testing is also a form of black-box testing [79]. +Tests consist of a set of test cases. Each test case consists of input values and a test oracle, which compares the expected output with the actual output to determine whether a program has failed or not [20]. To overcome the problem of having no oracles or the time-consuming process of writing them [94], metamorphic testing was introduced [28, 97]. Metamorphic testing creates follow-up test cases from a set of initial test cases using metamorphic relations. 
For example, if the initial test case evaluates the power function f(x) = 2^x with the value of x equal to 3, then the output is 2^3 = 8. Metamorphic testing creates another test case in which the value of x is −3, and the output is 2^−3 = 1/8. The metamorphic relation (MR) is used to check the outputs of the two tests. In this case, the MR is that the output of the first test case (8) multiplied by the output of the second test case (1/8) is equal to 1. If the MR is not satisfied, a failure is detected. +Mutation testing is an alternative testing approach which was designed to assess the quality of the test cases [35, 46]. Mutation testing creates a copy of the original program, called a mutant, with a seeded fault. The faults are a simple syntax change injected into the code [61, 80]. Tests are executed and the fault is detected if the output of the mutant is different from the output of the original program. Mutation testing computes a mutation adequacy score, which represents the number of detected faults over the total number of seeded faults. A higher score indicates a higher quality of the test sets. The MuJava tool was developed to perform automated mutation testing by generating mutants and computing the adequacy score for a set of JUnit tests [62]. +Software testing is labor intensive; thus, to reduce the costs, many automation techniques were developed to automate the generation of test data and test oracles [22, 23, 36, 55, 74, 90].
+Spectrum-based FL (SBFL) [1, 4, 18, 29, 32, 49, 86], which is a common FL approach, is a dynamic process that counts the number of passed and failed tests executed for each statement and computes a suspiciousness score for each statement. Statements executed during a failed run are considered to be more likely to contain faults and are thus assigned a higher suspiciousness score than other statements. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +21 Software Quality in the Era of Big Data, IoT and Smart Cities 523 +Table 21.1 The dynamic behavior of the faulty program gcd when executed against tests in T1, ..., T5. Sus. Score is the suspiciousness score computed using Tarantula + +Stmt T1 T2 T3 T4 T5 gcd (int a, int b) { +if(a < 0) //fault +{ printf(“%g \n”, b); +return 0 ; } while(b ! = 0) +if(a > b) +a = a − b ; +else +b = b − a ; printf(“%g \n”, a) ; return 0 ; +} x x +x x x x x x x x +x x x x +x x x x x x x x x x +x x +x x Stmt ID +Sus. Score +1 2 3 4 5 6 7 8 9 10 +1.00 0.00 0.00 0.50 0.57 0.00 0.57 0.57 0.00 0.00 +Many heuristics have been proposed to compute statement suspiciousness scores [1, 4, 48, 49, 77, 86]. +To illustrate how FL techniques order statements based on the likelihood they contain faults, we used the C program shown in Table 21.1 that is adapted from [47]. The program computes Euclid’s greatest common divisor. This example used four passed tests: T1, T2, T3, and T4, and one failed test: T5. To compute the suspiciousness score, we applied the Tarantula heuristic (Eq. (21.1)). To reduce the time of performing this step, many tools have been developed to automate other parts of testing, such as the FL techniques [45, 47, 83]. +susp_Tarantula(s) = %FailedTests(s) / (%PassedTests(s) + %FailedTests(s)) (21.1) +The debugging process also involves fixing located faults.
Although this was traditionally a manual process, automated program repair (APR) techniques were developed to automate the process [52, 53, 59, 63, 78]. APR techniques take a faulty program and conduct a set of repair tests to produce a repaired program. Figure 21.1 describes the overall structure of the APR techniques. The APR technique applies an FL technique to create a list of potentially faulty statement (LPFS) that is ordered based on their likelihood of containing fault, creates a copy of the original program with one inserted change called a variant, and validates the created variant to check whether or not the fault is fixed. +To create the variants, a set of program modification operators (PMOs) are applied to change the code in the faulty statement generating the variant. PMOs are selected randomly or in order based on the applied search algorithm. Then, each variant is validated by executing it on a set of test cases, regression tests, or formal specifications. The variant is considered a potential repair or potential repaired program if it passes all the tests used in the process. The generated repair + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +524 +Fig. 21.1 Overall automated program repair (APR) technique adapted from [15] +F. Y. Assiri and R. Mehmood + + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +is considered a potential repair, rather than a validated repair, because it is a repair with respect to the selected set of tests used in the process of fixing the faults. The repair is only considered a valid repair when it passes a set of tests (often regression tests) that were not included in the repair process. +Many researchers have contributed to improve the APR process and the quality of generate repairs. Debroy and Wong [33, 34] proposed using mutations through a brute-force search and an FL technique to automate fault fixing. Nguyen et al. 
[78] developed SemFix, which is a tool that locates faults using the Tarantula heuristic [49]. Then, symbolic execution and program synthesis were used to fix faults. Program syntheses are applied in a predefined order. Wei et al. [91] fix faults using Eiffel programs equipped with contracts, and Kim et al. [53] repaired faults by creating fix templates using 10 built-in patterns that were developed based on common patches written by humans. Weimer et al. [92] developed a weighting scheme to locate faults and applied an evolutionary algorithm to fix faults. APR techniques are also used to fix faults for executable software [25, 82]. Evolutionary computing and genetic programming have been adapted to repair faults in C software [38, 59, 92, 93], Java [12, 52], and Python [2], and to help satisfy non- functional requirements [13, 95]. +The state-of-the-art APR technique is GenProg tool, which uses genetic pro- gramming to modify a program until it finds a variant that passes all the repair test [38, 59, 92, 93]. GenProg was used to successfully fix the Microsoft Zune bug date error, which froze Microsoft devices in 2008 due to an infinite loop that occurred on the last day of a leap year [75]. However, repairs generated using GenProg were hard to read and it only performed potential repairs since they failed when they were executed on a set of regression tests. Assiri and Bieman [15–17] proposed using first-order mutations with a stochastic search algorithm to generate repairs that are similar to efficient ones written by humans. +Even though debugging activities (locating and fixing faults) have been auto- mated to reduce debugging costs, there are many new challenges particularly with big data because it runs largely on parallel cloud computing platforms, making + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +21 Software Quality in the Era of Big Data, IoT and Smart Cities 535 +it error prone and inefficient. 
Researchers have developed debugging tools to overcome these problems. +BigDebug is an interactive debugging tool that allows developers to set break- points to inspect program states during program execution [40]. BigDebug also provides guarded watchpoints, which return a set of records that satisfy a given condition. BigDebug, which provides backward and forward tracking and allows developers to fix faults and resume execution, improves the performance, avoids having to start the execution from the beginning, and reduces the locations should be checked for failures. +Considerable research has developed debugging tools for distributed systems. However, these typically depend on the use of a single frontend that controls many backend debuggers, which slows the process when used for large-scale distributed systems. Mehmood et al. [70] improved the structure of debuggers to scale them to large systems. The proposed debugging tool follows a hierarchical approach by using intermediate backend servers for a limited number of processes (Fig. 21.2), which evaluate assertions on the connected processes and report violations. This method improves the FL and system overall traffic, making it a suitable approach for large-scale distributed systems. +An alternative method for debugging a distributed system is to perform the debugging at higher-abstraction level than the unit level [21]. When performed at the system level, system behavior is translated into a set of events that are filtered to remove all events that are not of interest to the user. Event sequences are then clustered to create one single event that is used to identify the cause of failures in complex distributed systems. Event definition language (EDL) is used to define a set of events based on a combination of previously determined events. Events are compiled and interpreted to determine the cause of the failures. +Fig. 
21.2 PDB architecture adapted from [70] +Debugging tools rely on setting breakpoints or sets of slices to check the software’s behavior. Thus, if the specified locations of the variables do not contain the cause of the errors, the tools will be unable to identify the faulty code. Andrew and Myers developed the Whyline tool [54], an interactive debugging tool that allows developers to ask questions for a given output. Whyline records execution traces for each event and each execution trace has a specific trace file. Then, an output history is created for all stored events. When a class is loaded, Whyline runs an algorithm that depends on data dependencies to identify all variables and fields affected by the output. After identifying the codes responsible for the specified output, the tool generates questions using static and dynamic methods. Two questions are asked: why did and why did not. The first question is answered using the dynamic slicing technique and the latter is answered by investigating each instruction individually. The evaluation study found that using Whyline improved the debugging time for novice programmers, but it suffers from performance issues. +21.3 Model Checking +Model checking is a verification method that is performed to ensure program correctness by investigating all possible software internal states. Model checking requires a complete and clear set of properties that describes what the system should and should not do. The software states are checked against the specified properties. If a violation is found, counterexamples to the execution paths that caused the violation are generated. Model checking has been used to debug many systems such as airline reservation and e-commerce systems [19]. +Model checking has also been used to automate software testing (see Callahan et al. [24]). 
White-box testing, which concerns the software’s internal representation through the investigation of execution traces for intermediate values, detects errors if an inconsistency exists between the actual and expected values. Specification- based testing, which uses model checking techniques, was proposed to validate and generate tests during the software evolutionary process. In this method, a computation tree comprising all possible execution paths is generated and searched to ensure that all paths follow the specified constraints. +Even though the work by Callahan et al. [24] used a model checker to generate test cases automatically, Amman et al. [9, 11] proposed using a model checker to generate mutation-adequate test cases by adapting mutation testing. Model checking is used widely to write and validate specifications. The proposed combination of model checking and mutation testing addresses the limitation of automatic test generation and mutation testing at the system level. System specifications are converted into a format used by the model checker using a modeling tool. Then, the generated specifications are mutated and used by the model checker to create counterexamples, which are used to automatically generate test cases. Tests are executed and the results and coverage are reported. +For test generation, the SPIN model checker [44] is used to identify execution trace paths for a specified property. Paths are validated and divided into partitions based on a defined set of requirements; each partition, which is called a coverage property, consists of a set of execution paths. Test templates, comprising actual test sequences, are generated using SPIN and are used to create invalid coverage properties to force the program to fail. +Formal methods, such as software cost reduction (SCR), have been used to improve software quality. 
SCR reduces the development cost since it helps to detect violations at an early stage in the software life cycle before the implementation [39]. SCR uses requirements to generate test sequences that consist of a set of input values and a set of output values for each input. The input values are validated by checking the set of constraints that are specified through the requirement specifications. Then, the test sequences are divided into equivalent partitions and test inputs are generated for all partitions. +Model checking relies on building models of the actual systems and then verifying the models, and therefore, big data technologies can be used to automate the process of model building. Big data technologies could also improve the quality of models that are built before being model checked. Alternatively, model checking can be applied to address the veracity challenges of big data. +While model checking has been very successful in verifying real-life systems, its biggest hurdle is the state-space explosion problem. Researchers have developed various techniques to address this challenge. These include, among others, the use of high performance computing techniques, see, e.g., [66, 67, 69]. +21.4 Big Data +Big data is a relatively new research area that has been utilized in many fields such as online retail stores, decision-making, and scientific research [27]. Big data is defined variously in the literature: some researchers define it using the 3Vs: volume, velocity, and variety [56]. Volume relates to the size of the data, velocity is the speed of the data stream, and variety refers to the data types. Other researchers define big data using 4Vs, with the fourth V referring to value, variability, or virtual [98]. Fen and Befit defined big data as the 3Vs plus two more: variability (data interpretation) and value (making decisions) [37]. 
We consider the definition where volume, variety, velocity, and veracity are used as the 4Vs of big data [71], and consider veracity, as many have noted, to be the biggest challenge of big data. +Big data applications can be used in business, technology, health, and smart cities. Big data can be used to improve quality of life. Data have been used in online retail stores, such as Amazon, to identify user preferences. Algorithms collect information about the users’ preferences based on their actions [65]. In addition, the amount of healthcare data is increasing and is expected to reach a zettabyte in the near future in the USA [85]. Using this medical data will benefit individuals’ health by enabling doctors to detect diseases at the early stages and determine treatments, recovery options, and risks. For additional works on big data in context of smart cities, see [6, 7, 14, 68, 73, 88]. +21.5 Big Data and Software Quality +Data can be used as a validity tool to ensure software correctness, build rec- ommender systems, and predict future actions. Big data has been utilized in many sectors such as healthcare, banking, and transportation. Data are processed using data mining techniques to determine trends and to help in decision-making. Software quality can be related to big data in at least two ways. Firstly, big data can help develop better software quality techniques. Secondly, software quality techniques are needed to improve the quality of big data software and possibly deal with the big data veracity challenge. +With respect to software quality, existing work has applied data mining tech- niques to analyze data repositories, fix faults, determine trends, and automate test generation. +21.5.1 Mining Big Data +Data mining is performed to analyze large amounts of data to understand trends in the data and support decision-making [42]. 
Software intelligence (SI) is a new field of mining software data to help practitioners in daily decision-making processes, such as when to release the system, what part of the system to test, and/or what part to change [43]. +Mining software repositories is a research direction that analyzes data repositories to obtain useful information about systems and projects. The types of repositories include historical repositories that show project progress; run-time repositories, which show system usage on deployment sites; and code repositories, which contain the code for software versions. Linking code repositories and bug repositories can provide a method for warning practitioners about bugs and risky codes. +Lin and Ryaboy analyzed Twitter data using data mining tools; however, due to the limitations of existing tools, the analysis was not a straightforward process [60]. In [89], the researchers mined heterogeneous information using the semantics of node types and the links between them in the networks. The researchers in [51] studied the potential of mining big graphs and found the PEGASUS tool to be a promising approach since it finds anomalies in the large Twitter connected graphs. Last, the authors in [8] focused on mining a large stream of Netflix Prize data to personalize recommendations. To improve the probabilities of customers’ selections, a lot of factors and more data need to be considered. +The authors in [50] used mining bug reports to develop the BugMiner tool, which uses the support vector machines (SVM) machine learning technique to perform a completion check and a redundancy check on new reports and estimate bug report trends (e.g., incident rate over time) of bug report databases using natural language processing. SVM used the historic reports to train the model to fill any missing fields. 
For any given report, the tool checks if it already exists by applying similarity ranking using cosine similarity, and a Weibull distribution uses historic data to estimate the number of bug reports received during a specified period (weeks or months) after the start of the project. The experimental results showed that BugMiner was effective in terms of bug report completion, redundancy, and finding trends. The authors suggest combining the tool with other bug tracking tools to create advanced intelligent software. +Mining software is also used to develop a repair model in the area of APR [64]. In their paper, the authors mine software repositories by investigating developers’ comments to generate repair actions that can be used later to fix faults. Repair actions can be in the form of adding a method call or changing the condition of if statements. Repair actions are then assigned different probabilities that are also learned from the repositories. To collect fixes from repositories, the authors used a data set of 14 repositories and checked the differences between transitions at the abstract syntax tree (AST) level. A difference algorithm was used to produce the set of changes between each pair of Java files. The authors generated 41 change types and 137 possible change type entity types. The empirical study found that 28% of the changes were statement insertions, 23% were statement deletions, and 23% were statement updates. However, the change type statement insert was composed of many entity types, e.g., insert method invocation, if conditional, insert new variable. The results showed that the probability distribution of change type is project independent. +To repair faults, the authors of [64] created a repair model and used different approaches to compute the probabilities of each repair action. The repair shape, which is a set of all possible combinations of repair actions, was then created. 
The search space is a combination of fault space, repair shapes, and the concrete repair actions that create the shape. +In [96], the authors mined software repositories to study the co-evolution of the production code and test code. Repository histories and log messages were analyzed; however, the results found no matching between changes in the production code and the test. In other words, the test codes remained the same after changing the production code. The test coverage also dropped since no new test was created to guarantee the coverage of the new boundary values. Despite the notable finding, the study failed to specify which data mining techniques were used to check the repositories. +Data mining algorithms are used to automatically induce missing functional requirements from data executions [58]. This approach can help to recover missing and incomplete specifications, design regression tests, and evaluate the correct- ness of software. Creating up-to-date regression tests is difficult, especially with legacy systems. One way to create regression tests is to identify the input–output relationships to write the requirements of the existing system. In [57], the authors proposed to identify the input–output relationships automatically using info-fuzzy networks (IFN), and they evaluated the effectiveness of IFN methodology on complex systems. The experimental results found that the data mining methods are effective for generating tests automatically without needing humans or complete sets of requirements since functional requirements are learned from data execution. +This study compares two approaches of automated construction of oracle: artificial neural networks (ANNs) and IFNs [3]. ANNs have been used to generate a minimal set of tests that are effective at revealing faults [57, 87]. 
To generate oracles automatically, the following three steps are performed: (1) the training phase, where the system is given positive oracles; (2) the evaluation phase, which accepts positive oracles and rejects negative ones; and (3) the decision phase in which the trained oracles identify correct test cases from unlabeled ones. The experimental results found that IFN would be more appropriate for testing applications that are at the early stages. However, ANNs appear to be better at identifying hard-to-detect faults. +Data mining techniques have been adapted to troubleshoot distributed sys- tems [30]. The goal of this approach is to identify which resources properties would succeed or fail for specific jobs. To demonstrate this approach, the job and machine features for 1000 jobs were extracted, and the job status was described as either a success or failure. Then, two data mining techniques were applied to generate a prediction model: C4.5 decision tree [84] and RIPPER rule-based classification algorithm [31]. Even though both methods predicted that the same features would cause the failures, RIPPER was found to be a more robust and promising method. While other data mining techniques, such as the lazy learning technique, can be applied, they tend to require more information before drawing the model. Additional research is needed to examine more internal or external features. +21.6 Summary, Conclusions, and Future Work +Software quality is the degree to which the software conforms to its requirements. General software quality attributes include testability, maintainability, efficiency, and reliability. One important aspect of software quality is software correctness, which concerns how well the program provides the required functionalities, as defined by its specifications, and can be achieved through software testing and debugging. 
The complexity of software is on the rise with the developments of smart cities due to the complex nature of these applications and environments. Big data and Internet of Things (IoT) are driving radical changes in the software systems landscape. Together, big data, IoT, smart cities, and other emerging complex applications have exacerbated the challenges of maintaining software quality. +The big data produced by IoT and other sources is used in designing or operating various software machines and systems. Since the data is uncertain (i.e., the veracity characteristic), it could lead to inaccurate or faulty system behavior. In this paper, we reviewed the technologies related to software quality in the era of big data, IoT, and smart cities. We elaborated on software quality processes, software testing and debugging. Model checking was discussed with some directions on the role it could play in the big data era and the benefits it could gain from big data. The role of big data in software quality was explored. +We discussed that software quality can be related to big data in at least two ways. Firstly, big data can help develop better software quality techniques. Secondly, software quality techniques are needed to improve the quality of big data software and possibly deal with the big data veracity challenge. We also highlighted that big data technologies can be used to automate the process of model building as part of the model checking process. Big data technologies could also improve the quality of models that are built before being model checked. Alternatively, model checking can be applied to address the veracity challenges of big data. As mentioned that the biggest hurdle of model checking is the state-space explosion problem that could be addressed using high performance computing techniques. 
+Our future work will focus on bringing together cutting-edge software quality and big data techniques to develop novel techniques for improving software and data quality of smart city systems. +References +1. Abreu, R., Zoeteweij, P., Van Gemund, A.J.: On the accuracy of spectrum-based fault local- ization. In: Testing: Academic and Industrial Conference Practice and Research Techniques- MUTATION, 2007. TAICPART-MUTATION 2007, pp. 89–98. IEEE, Piscataway (2007) +2. Ackling, T., Alexander, B., Grunert, I.: Evolving patches for software repair. In: Proceedings of the 13th Annual Conference on Genetic and Evolutionary Computation, GECCO ’11, pp. 1427–1434. ACM, New York (2011) +3. Agarwal, D.: A comparative study of artificial neural networks and info fuzzy networks on their use in software testing. Master’s Thesis, University of South Florida (2004) +4. Agrawal, H., Horgan, J.R., London, S., Wong, W.E.: Fault localization using execution slices and dataflow tests. In: Proceedings of the Sixth International Symposium on Software Reliability Engineering, pp. 143–151. IEEE, Piscataway (1995) +5. Alam, F., Mehmood, R., Katib, I., Albeshri, A.: Analysis of eight data mining algo- rithms for smarter internet of things (IOT). Procedia Comput. Sci. 98, 437–442 (2016). https://doi.org/10.1016/j.procs.2016.09.068. http://www.sciencedirect.com/science/article/pii/ S187705091632213X. The 7th International Conference on Emerging Ubiquitous Systems and Pervasive Networks (EUSPN 2016)/The 6th International Conference on Current and Future Trends of Information and Communication Technologies in Healthcare (ICTH-2016)/Affiliated Workshops +6. Alomari, E., Mehmood, R.: Analysis of Tweets in Arabic Language for Detection of Road Traffic Conditions, pp. 98–110. Springer, Cham (2018). https://doi.org/10.1007/978-3-319- 94180-6_12. http://link.springer.com/10.1007/978-3-319-94180-6_12 +7. 
Alotaibi, S., Mehmood, R.: Big Data Enabled Healthcare Supply Chain Management: Oppor- tunities and Challenges, pp. 207–215. Springer, Cham (2018). https://doi.org/10.1007/978-3- 319-94180-6_21. http://link.springer.com/10.1007/978-3-319-94180-6_21 +8. Amatriain, X.: Mining large streams of user data for personalized recommendations. ACM SIGKDD Explor. Newsl. 14(2), 37–48 (2013) +9. Ammann, P.: System testing via mutation analysis of model checking specifications. ACM SIGSOFT Softw. Eng. Notes 25(1), 33 (2000) +10. Ammann, P., Offutt, J.: Introduction to software testing, Cambridge University Press, Cam- bridge (2016) +11. Ammann, P.E., Black, P.E., Majurski, W.: Using model checking to generate tests from specifications. In: Proceedings of Second International Conference on Formal Engineering Methods, pp. 46–54. IEEE, Piscataway (1998) +12. Arcuri, A.: On the automation of fixing software bugs. In: Companion of the 30th International Conference on Software Engineering, ICSE Companion ’08, pp. 1003–1006. ACM, New York (2008) +13. Arcuri, A., Yao, X.: A novel co-evolutionary approach to automatic software bug fixing. In: IEEE Congress on Evolutionary Computation, 2008. CEC 2008. (IEEE World Congress on Computational Intelligence), pp. 162–168. IEEE, Piscataway (2008) +14. Arfat, Y., Mehmood, R., Albeshri, A.: Parallel Shortest Path Graph Computations of United States Road Network Data on Apache Spark, pp. 323–336. Springer, Cham (2018). https:// doi.org/10.1007/978-3-319-94180-6_30. http://link.springer.com/10.1007/978-3-319-94180- 6_30 +15. Assiri, F.Y., Bieman, J.M.: An assessment of the quality of automated program operator repair. In: Proceedings of the 2014 ICST Conference, ICST’14, IEEE, Piscataway (2014) +16. Assiri, F.Y., Bieman, J.M.: The impact of search algorithms in automated program repair. Submitted to the 2015 International Conference on Soft Computing and Software Engineering, (SeSe’15) (2015) +17. 
Assiri, F.Y., Bieman, J.M.: Fault localization for automated program repair: effectiveness, performance, repair correctness. Softw. Qual. J. 25(1), 171–199 (2017) +18. Baah, G.K., Podgurski, A., Harrold, M.J.: The probabilistic program dependence graph and its application to fault diagnosis. IEEE Trans. Softw. Eng. 36(4), 528–545 (2010) +19. Baier, C., Katoen, J.P.: Principles of model checking. MIT Press, Cambridge (2008) +20. Baresi, L., Young, M.: Test oracles. Tech. Rep., Technical Report CIS-TR-01-02, University of Oregon, Dept. of Computer and Information Science, Eugene, Oregon (2001) +21. Bates, P.C., Wileden, J.C.: High-level debugging of distributed systems: the behavioral abstraction approach. J. Syst. Softw. 3(4), 255–264 (1983) +22. Boyapati, C., Khurshid, S., Marinov, D.: Korat: automated testing based on java predicates. In: ACM SIGSOFT Software Engineering Notes, vol. 27, pp. 123–133. ACM, New York (2002) +23. Burdonov, I., Kossatchev, A., Petrenko, A., Galter, D.: Kvest: automated generation of test suites from formal specifications. In: International Symposium on Formal Methods, pp. 608– 621. Springer, Berlin (1999) +24. Callahan, J., Schneider, F., Easterbrook, S., et al.: Automated software testing using model- checking. In: Proceedings 1996 SPIN workshop, vol. 353 (1996) +25. Carzaniga, A., Gorla, A., Mattavelli, A., Perino, N., Pezze, M.: Automatic recovery from run- time failures. In: Proceedings of the 2013 International Conference on Software Engineering, pp. 782–791. IEEE, Piscataway (2013) +26. Chappell, D.: The three aspects of software quality: functional, structural, and process, White Paper. Chappell & Associates, San Francisco, CA. Available at www.davidchappell.com. Last accessed 30 May 2019 +27. Chen, C.P., Zhang, C.Y.: Data-intensive applications, challenges, techniques and technologies: a survey on big data. Inf. Sci. 275, 314–347 (2014) +28. 
Chen, T.Y., Cheung, S.C., Yiu, S.M.: Metamorphic testing: a new approach for generating next test cases. Tech. Rep., Technical Report HKUST-CS98-01, Department of Computer Science, Hong Kong University of Science and Technology, Hong Kong (1998) +29. Chilimbi, T.M., Liblit, B., Mehra, K., Nori, A.V., Vaswani, K.: Holmes: effective statistical debugging via efficient path profiling. In: IEEE 31st International Conference on Software Engineering, 2009. ICSE 2009, pp. 34–44. IEEE, Piscataway (2009) +30. Cieslak, D.A., Thain, D., Chawla, N.V.: Short paper: troubleshooting distributed systems via data mining. In: 15th IEEE International Symposium on High Performance Distributed Computing, pp. 309–312. IEEE, Piscataway (2006) +31. Cohen, W.W.: Fast effective rule induction. In: Machine Learning Proceedings 1995, pp. 115– 123. Elsevier, Amsterdam (1995) +32. Dallmeier, V., Lindig, C., Zeller, A.: Lightweight defect localization for Java. In: ECOOP 2005- Object-Oriented Programming, pp. 528–550. Springer, Berlin (2005) +33. Debroy, V., Wong, W.E.: Using mutation to automatically suggest fixes for faulty programs. In: Third International Conference on Software Testing, Verification and Validation (ICST), pp. 65–74. IEEE, Piscataway (2010) +34. Debroy, V., Wong, W.E.: Combining mutation and fault localization for automated program debugging. J. Syst. Softw. 90, 45–60 (2014) +35. DeMillo, R.A., Lipton, R.J., Sayward, F.G.: Hints on test data selection: help for the practicing programmer. Computer 11(4), 34–41 (1978) +36. Dick, J., Faivre, A.: Automating the generation and sequencing of test cases from model-based specifications. In: International Symposium of Formal Methods Europe, pp. 268–284. Springer, Berlin (1993) +37. Fan, W., Bifet, A.: Mining big data: current status, and forecast to the future. ACM SIGKDD Explor. Newsl. 14(2), 1–5 (2013) +38. Forrest, S., Nguyen, T., Weimer, W., Le Goues, C.: A genetic programming approach to automated software repair. 
In: Proceedings of the 11th Annual conference on Genetic and evolutionary computation, GECCO ’09, pp. 947–954. ACM, New York (2009) +39. Gargantini, A., Heitmeyer, C.: Using model checking to generate tests from requirements specifications. In: ACM SIGSOFT Software Engineering Notes, vol. 24, pp. 146–162. Springer, Berlin (1999) +40. Gulzar, M.A., Interlandi, M., Yoo, S., Tetali, S.D., Condie, T., Millstein, T., Kim, M.: Bigdebug: debugging primitives for interactive big data processing in spark. In: Proceedings of the 38th International Conference on Software Engineering, pp. 784–795. ACM, New York (2016) +41. Hailpern, B., Santhanam, P.: Software debugging, testing, and verification. IBM Syst. J. 41(1), 4–12 (2002) +42. Hand, D.J.: Principles of data mining. Drug Saf. 30(7), 621–622 (2007) +43. Hassan, A.E., Xie, T.: Software intelligence: the future of mining software engineering data. In: Proceedings of the FSE/SDP Workshop on Future of Software Engineering Research, pp. 161– 166. ACM, New York (2010) +44. Holzmann, G.J.: Design and Verification of Computer Protocols, Prentice Hall, Upper Saddle River (1991) +45. Janssen, T., Abreu, R., van Gemund, A.J.: Zoltar: A toolset for automatic fault localization. In: Proceedings of the 2009 IEEE/ACM International Conference on Automated Software Engineering, pp. 662–664. IEEE Computer Society, Washington, D.C. (2009) +46. Jia, Y., Harman, M.: An analysis and survey of the development of mutation testing. IEEE Trans. Softw. Eng. 37(5), 649–678 (2011) +47. Jones, J.A., Harrold, M.J.: Empirical evaluation of the Tarantula automatic fault-localization technique. In: Proceedings of the 20th IEEE/ACM international Conference on Automated Software Engineering, pp. 273–282. ACM, New York (2005) +48. Jones, J.A., Harrold, M.J., Stasko, J.T.: Visualization for fault localization. In: Proceedings of ICSE 2001 Workshop on Software Visualization, Toronto, Ontario, pp. 71–75. Citeseer (2001) +49. 
Jones, J.A., Harrold, M.J., Stasko, J.: Visualization of test information to assist fault localization. In: Proceedings of the 24th International Conference on Software Engineering, pp. 467–477. ACM, New York (2002) +50. Kaiser, L.W.B.X.G., Passonneau, R.: Bugminer: Software reliability analysis via data mining of bug reports. Delta 12(10), 09–0500 (2011) +51. Kang, U., Faloutsos, C.: Big graph mining: algorithms and discoveries. ACM SIGKDD Explor. Newsl. 14(2), 29–36 (2013) +52. Kern, C., Esparza, J.: Automatic error correction of Java programs. In: Proceedings of the 15th International Conference on Formal Methods for Industrial Critical Systems, FMICS’10, pp. 67–81. Springer, Berlin (2010) +53. Kim, D., Nam, J., Song, J., Kim, S.: Automatic patch generation learned from human-written patches. In: Proceedings of the 2013 International Conference on Software Engineering, pp. 802–811. IEEE, Piscataway (2013) +54. Ko, A.J., Myers, B.A.: Debugging reinvented: asking and answering why and why not questions about program behavior. In: Proceedings of the 30th International Conference on Software Engineering, pp. 301–310. ACM, New York (2008) +55. Lamancha, B.P., Polo, M., Caivano, D., Piattini, M., Visaggio, G.: Automated generation of test oracles using a model-driven approach. Inf. Softw. Technol. 55(2), 301–319 (2013) +56. Laney, D.: 3d data management: controlling data volume, velocity and variety. META Group Res. Note 6(70), 1 (2001) +57. Last, M., Kandel, A.: Automated test reduction using an info-fuzzy network. In: Software Engineering with Computational Intelligence, pp. 235–258. Springer, Boston (2003) +58. Last, M., Friedman, M., Kandel, A.: The data mining approach to automated software testing. In: Proceedings of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 388–396. ACM, New York (2003) +59. Le Goues, C., Nguyen, T., Forrest, S., Weimer, W.: GenProg: a generic method for automatic software repair. IEEE Trans. 
Softw. Eng. 38(1), 54–72 (2012) +60. Lin, J., Ryaboy, D.: Scaling big data mining infrastructure: the twitter experience. ACM SIGKDD Explor. Newsl. 14(2), 6–19 (2013) +61. Ma, Y.S., Kwon, Y.R., Offutt, J.: Inter-class mutation operators for java. In: Proceedings of 13th International Symposium on Software Reliability Engineering, 2002. ISSRE 2003, pp. 352– 363. IEEE, Piscataway (2002) +62. Ma, Y.S., Offutt, J., Kwon, Y.R.: Mujava: a mutation system for Java. In: Proceedings of the 28th International Conference on Software Engineering, pp. 827–830. ACM, New York (2006) +63. Martinez, M., Monperrus, M.: Astor: evolutionary automatic software repair for Java. arXiv preprint arXiv:1410.6651 (2014) +64. Martinez, M., Monperrus, M.: Mining software repair models for reasoning on the search space of automated program fixing. Empir. Softw. Eng. 20(1), 176–205 (2015) +65. McAfee, A., Brynjolfsson, E., Davenport, T.H., Patil, D., Barton, D.: Big data: the management revolution. Harv. Bus. Rev. 90(10), 60–68 (2012) +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt b/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt new file mode 100644 index 0000000..9ce75b9 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/77-SAT-ETL-Integratoranextract-transform-loadsoftwareforsatellitebigdataingestion.txt @@ -0,0 +1,115 @@ + +Created with an evaluation copy of Aspose.Words. 
To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +SAT-ETL-Integrator: an extract-transform-load software for satellite big data ingestion +Badr-Eddine Boudriki Semlali Chaker El Amrani Guadalupe Ortiz +Badr-Eddine Boudriki Semlali, Chaker El Amrani, Guadalupe Ortiz, SAT-ETL-Integrator: an extract-transform-load software for satellite big data ingestion, J. Appl. Remote Sens. 14(1), 018501 (2020), doi: 10.1117/1.JRS.14.018501 + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software... +SAT-ETL-Integrator: an extract-transform-load +software for satellite big data ingestion +Badr-Eddine Boudriki Semlali,a,* Chaker El Amrani,a and +Guadalupe Ortizb +aAbdelmalek Essaâdi University, LIST Laboratory, Faculty of Sciences and Techniques, +Tangier, Morocco +bUniversity of Cadiz, UCASE Research Group, Escuela Superior de Ingeniería, Cadiz, Spain +Abstract. Satellite data are used in several environmental applications, particularly in air quality supervising, climate change monitoring, and natural disaster predictions. However, remote sensing (RS) data occur in huge volume, in near-real time, and are stored inside complex structures. We aim to prove that satellite data are big data (BD). Accordingly, we propose a software as an extract-transform-load tool for satellite data preprocessing. We focused on the ingestion layer that will enable an efficient RSBD integration. As a result, the developed software layer receives data continuously and removes ∼86% of the unused files. This layer also eliminates nearly 20% of erroneous datasets. Thanks to the proposed approach, we successfully reduced storage space consumption, enhanced the RS data accuracy, and integrated preprocessed datasets into a Hadoop distributed file system.
© 2020 Society of Photo-Optical Instrumentation Engineers (SPIE) [DOI: 10.1117/1.JRS.14.018501] +Keywords: remote sensing big data; ingestion layer; extract transform load software; data integration. +Paper 190597 received Sep. 5, 2019; accepted for publication Jan. 7, 2020; published online Jan. 25, 2020. +1 Introduction +Recently, the world has witnessed a great rise in industrial, agricultural, and transport activities. This development certainly helps to improve the economic and the social status of countries. But it also causes many environmental issues that affect the quality of human health and the safety of our planet, such as the appearance of the ozone hole, the increase in climate changes, and the degradation of air quality (AQ) by the emission of many anthropogenic pollutants, such as carbon monoxide (CO), carbon dioxide (CO2), nitrogenous oxides (NOx), and methane (CH4).1 Thus remote sensing (RS) techniques are one of the proposed solutions enabling a near-real-time (NRT) tracking of the pollutant plumes emitted from the industrial and agricultural areas,2 ozone precursor estimation, aerosol optical depth (AOD) monitoring, and climate change monitoring. In addition, they provide potential input data for AQ models. +Generally, RS technique refers to the use of satellite data to measure ocean, Earth, and atmospheric components without making physical contact with them through the electromagnetic energy (EME).3 At present, there are more than 3000 satellites in orbit4 used for many purposes, such as military, Earth observation, weather, and forecasting support.
All of these satellites are equipped with many active and/or passive sensors within different temporal, spatial, and spectral resolutions ranging from low to very high.5 +Basically, satellite sensors measure data, then the satellite processing unit corrects the erroneous data using specific algorithms including SPECAN and Doppler.6 Afterward, data are transmitted into ground stations through downlink channels to be distributed into a broadcast or a multicast. +In this study, we collect data from the European Organization for the Exploitation of Meteorological Satellites (EUMETSAT) via the Mediterranean Dialogue Earth Observatory (MDEO) ground station installed at Abdelmalek Essaâdi University of Tangier in Morocco.7 +*Address all correspondence to Badr-Eddine Boudriki Semlali, E-mail: badreddine.boudrikisemlali@uae.ac.ma 1931-3195/2020/$28.00 © 2020 SPIE +We also acquired RS data from the Earth Observation System Data and Information System (EOSDIS) of the National Aeronautics and Space Administration (NASA), the Infusing Satellite Data into Environmental Applications (NESDIS) of the National Oceanic and Atmospheric Administration (NOAA), and The Copernicus Open Access Hub (previously known as Sentinels Scientific Data Hub) built and operated by the European Space Agency (ESA), which provided complete, free, and open access to Sentinel-1, Sentinel-2, Sentinel-3, and Sentinel-5P user products, starting from the in-orbit commissioning review. The acquired RS data comes from many polar and geostationary satellites and various sensors. +These data are stored in specific complex scientific file extensions: the binary universal form for the representation (BUFR) of meteorological data, the network common data form (NetCDF), and the hierarchical data format (HDF5). The daily volume of the received RS data reaches 40 gigabits (GB) and exceeds 15 terabits (TB) per year. Furthermore, the speed with which data are received is very fast, at a rate of 30,000 files per day.
Accordingly, and according to attribute definition (venue, volume, variety, veracity, velocity, and so on), the data may be classified as big data (BD).8 Based on these aforementioned brief statistics, we are going to confirm that satellite data are BD. +Consequently, remote sensing big data (RSBD) turns out to be an extremely challenging problem to be dealt with, including an efficient, rapid, and NRT processing. In addition, RSBD for environmental observation is regarded as a data-intensive process because the volume, complexity, and the velocity exceed the usual processing systems and architectures.9 +For this reason, we have adopted the Hadoop BD architecture to split the problems of RSBD. The proposed design includes six interactive layers, which are the data sources, the ingestion layer, the Hadoop storage, monitoring layer, and the visualization layer. In this paper, we will focus only on the ingestion layer. This phase is very critical because it is responsible to collect unprocessed RS data, to manage enormous volume of input data, to extract, to filter, and to integrate refined RS data into a Hadoop Distributed File System (HDFS). +As a result, the developed extract transform load (ETL) tool has efficiently processed and extracted potential values with high accuracy and with a low storage volume in a moderate execution time. Furthermore, the developed software has performed all steps automatically and processes global RS data. +The remainder of this paper is organized as follows: Secs. 2, 3, and 4 enumerate, respectively, the issues, the main focus of this paper, and a review of some related works, Sec. 5 presents the different aspects and characteristics of RSBD, Sec. 6 goes into the details concerning the challenges of RSBD and explains the architecture developed for the ingestion layer, Sec. 7 provides the results and discusses the experimental analysis.
+2 Issues +RS data are widely used for several environmental applications, particularly in air pollution and climate change monitoring. However, the exploitation of these data contains many challenges, which are as follows: +The specifications of RS data, including the venue, the volume, and the velocity are complex in terms of processing. +Satellite data should be processed in NRT to keep their freshness. +Satellite data sometimes contain errors, gaps, and invalid datasets. It is recommended to remove them before the storage step. +The existing architectures and solutions have some limitations and drawbacks in RS data ingestion. +3 Main Focus of This Paper This study has the following aims. +Understanding the nature and the characteristics of the used satellite data and proving that we are working with RSBD. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Journal of Applied Remote Sensing 018501-2 Jan Mar 2020 Vol. 14(1) +Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software... +Developing a software as an ingestion layer for RS data integration, regarded as similar to an ETL tool as known from data warehousing. +Storing the refined RS datasets into an HDFS. +4 Background and Related Works +The general architecture of satellite data processing consists of three logical groups of servers: receiving servers, preliminary processing and thematic processing servers, and data storage servers accommodating large daily volume of data. There are some examples of the satellite data receiving platforms as follows: +The Office of Satellite and Product Operation of NOAA. +The EUMETCast service of EUMETSAT. +The ground segment system developed by ESA within the European Remote Sensing program. +The receiving servers collect the data in NRT from satellite without any modules of processing. For instance, there are as follows: +The Fairbanks (POES) and the Wallops (GOES) ground stations of NOAA.
+The Command and Data Acquisition (Polar system) and the Primary Ground Station (Geostationary system) of EUMETSAT. +The preliminary processing performs radiometric calibration of the received data using spe- cific software such as SPECAN and Doppler. This stage of processing provides data of level 1. We can site some of the existing satellites processing center in the world as follows: +The Satellite Operation Control Center of NOAA. +The Environmental Satellite Processing Center of NOAA. +The Earth Observing System and Operation System of NASA. The Science Data Processing Segment of NASA. +The Central Facility (CF) of EUMETSAT. +The Data Processing Ground Segment of ESA. +Second, the processing server provides refined products, particularly atmospheric chem- istry, atmospheric temperature, humidity, fire, smoke, and so on to the customers through a website interface. These platforms offer to the end users easy online searching, exploring, and filtering based on keyword, satellites, instruments, organizations, projects, processing level, and temporal and/or spatial delimiters. Moreover, they visualize datasets into interactive maps in NRT and make data available for downloading via file transfer protocol (FTP) or hypertext transfer protocol (HTTP) servers. The primary goal of these platforms is to maximize the scientific return for mission, research, and decision makers. All these services are free and open to all users for any scientific purpose. The following list includes some of the pioneer platforms. +The Earth Science Data Systems Program of NASA.10 +The Comprehensive Large Array-data Stewardship System of NOAA.11 The Copernicus Open Access Hub operated by ESA.12 +The Product Navigator of EUMETSAT.13 +The finalstep of processing consists of storing the processed satellite data into data centers as data storage system group. 
There are four big satellite data centers in the world, which are: +the EOSDIS of NASA, +the NESDIS of NOAA, +the EUMETSAT Data Center, +the European Space Astronomy Centre Science Data Centre. +Currently, RS data are widely used in many scientific disciplines such as environmental and social sciences. This has led to an increase of RS data that will continue to scale exponentially. Thus the processing of the RS data includes many challenges, beginning from the acquisition + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Journal of Applied Remote Sensing 018501-3 Jan Mar 2020 Vol. 14(1) +Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software... +to the visualization step,14 as follows: (1) satellite data are measured in NRT from satellite sensors, then transmitted to ground datacenters through downlinks, so the big protest is how to download these data from their sources within a high speed to keep their freshness. (2) Such data should be preprocessed inside an ingestion layer to be integrated into scalable servers with big storage capacity. (3) The treatment of RS data requires permanent and functional clusters; accordingly, this consumes more energy, so the electrical power should also be economized. (4) It is very possible to find many duplicated datasets, so the elimination of redundancy will help to hold only potential values. (5) In addition, satellite data are pervasive; they generate a huge volume of data with high velocity that storage system cannot continuously host, so it is necessary to remove old RS data by creating a model that decides which data to keep and which to discard. (6) Satellite data include many noisy and erroneous datasets due to the uncer- tainty of sensors. Accordingly, developing an efficient data-refining software will be beneficial for enhancing the satellite data accuracy. 
(7) RSBD processing demands some knowledge in probability and statistics in order to employ deep learning (DL), machine learning, and neural network algorithms to unlock new insights.
+Despite the existing aforementioned strong architectures, platforms, and systems from big organizations such as the NASA, NOAA, EUMETSAT, and the ESA, we can find some limitations and challenges of processing. In addition, sometimes their technologies are exceeded by the complexity and the huge volume of the acquired RS data.9
+RS data processing is becoming a significant field of research. Many investigations have been made on different architectures. These research studies aim principally as follows:
+To optimize algorithms and processing patterns, JIN Hailiang combined the index and the Hilbert curve to establish the index for the image data. Then the method of MapReduce parallel processing was used to write and query RS images. The experimental results showed that the method can effectively improve the data writing and query speed and has good scalability.15
+To include parallel computing techniques,16 to store and process RSBD within a distributed Hadoop platform,17 and to manage RSBD with the streaming processing tools.18
+To propose a combination of streaming and MapReduce for analysis of time series data, they tested their proposal by applying the break detection algorithm BFAST to MODIS imagery. Then they evaluated the computing performance and requirements quality attributes. Their results revealed that the combination of Hadoop and R can handle complex analysis of RS time series.
+To come up with an empirical model of DI index to estimate RS applications.9 Muhammad Mazhar designed a real-time BD analytical architecture for RS satellites applications (Rathore et al., 2015).
+Winda Astriani performed an ETL model to create multidimensional data cube.
The ETL application of using Geokettle expected to provide data warehouse developers with performing automatic preprocessing data that allows regulating the insertion of new data and updating data without generating a lot of queries.19
+RS data are regarded as BD according to the attribute definition based on the eight salients (venue, volume, velocity, value, veracity, vocabulary, validity, and variety). So that adopting a BD analytics architecture is very crucial to make the processing efficient, to gain insights, and to make better decisions.
+Our study focuses mainly on air pollution and climate change monitoring requiring tremendous RS data coming in NRT from many satellites and sensors within different temporal and spatial resolutions (SPRs). The nature of these data is complex and their volume is huge.6 Thus building a BD architecture for RS data will help absolutely in data acquisition, filtering, storage, processing, and visualization.
+This paper introduces an ingestion layer as a software system consisting of different components which fill the gaps between external data sources and the HDFS. This software can be regarded as an ETL for raster satellite data, which allows efficient handling of acquired data from several sources and integrating them in an optimized way into an HDFS and separates storage issues from algorithm and application issues.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Journal of Applied Remote Sensing 018501-4 Jan Mar 2020 Vol. 14(1)
+Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software...
+5 Remote Sensing Big Data: Aspects and Specification
+This section describes the characteristics of the satellite data used in terms of volume, velocity, variety, and so on to demonstrate that RS data are BD.
+
5.1 Satellite Big Data: Aspects and Features
+Generally, RS techniques are defined as the technologies measuring the surface, ocean, and atmospheric components without making a physical contact with it through EME20; satellites
+are regarded as the key instrument of this technique.
+A satellite can be defined as an artificial machine placed into a specific orbit; this orbit can be polar passing by Sun-synchronous orbits (SSO), which combines altitude and inclination in such a way that the satellite passes over any given point of the planet's surface at the same local solar time. Geostationary orbit is placed with an altitude of ∼36,000 km directly over the equator and revolves in the same direction that Earth rotates (west to east). At this altitude, one orbit takes 24 h.21 We can cite three types of orbital altitude, which are the low earth orbit (LEO), the medium earth orbit, and the high earth orbit.22
+Satellites are equipped with passive sensors such as LIDAR, RADAR, scatter meter, sounder, and laser altimeter detecting sunlight radiation reflected from the earth and thermal radiation in the visible and infrared of the electromagnetic spectrum. In addition, they do not emit their own radiation but receive natural light and thermal radiation from the Earth's surface.
+The second type is the active sensors (e.g., radar and laser scanners) emitting an artificial radiation to monitor the earth surface or atmospheric features. Moreover, they do not depend on daylight and are minimally affected by clouds, dust, fog, wind, and bad weather conditions.5
+Furthermore, satellite sensors have other specifications, particularly the SPR, which means the Earth is surface-scanned by the instrument, ranging from low to very high.
+In addition, satellite sensors have a specific frequency to across the same geolocation, called the temporal resolution (TMR), which varies as high, medium, and low TMR.
+Satellite sensors continuously measure environmental variables and parameters.
Afterward, the satellite processing unit corrects the enormous measured data using some algorithms including Doppler or SPECAN. This correction concerns the SPR and the geo-localization errors.6 Data will be transmitted into antennas in ground stations through downlink channels. The ground stations process RS data in order to remove imperfections, ensure geometric corrections, and apply data calibrations. This step will generate RS data of level 2 (L2) and level 3 (L3) of processing.
+In our research, we aim to apply RS techniques to track pollutant plumes emitted from industrial and agricultural activities, detect wildfires, monitor climate changes, and supply Moroccan forecasting agencies in NRT in order to prevent damages and help decision makers. In this investigation, we collect data from the EUMETSAT via the MDEO ground station installed at Abdelmalek Essaâdi University of Tangier in Morocco.23 We also acquired RS data from the EOSDIS of NOAA, the NESDIS of NOAA, and the Copernicus platform.24
+From the statistical data in Table 1 and according to Fig. 1, we can determine that there are many sources providing RS data from various satellites (venue), wherein all of these satellites are for environmental monitoring and meteorological application. These satellites are polar passing by an SSO excepting the geostationary Meteosat second generation (MSG).25 The majority of these satellites were launched in this last decade; for instance, the MetOp B in 2012,26 the Suomi National Polar-orbiting Partnership (NPP) in 2011, Sentinel-3A in 2016, and the Sentinel-5P in 2017.27 The MetOp C will be launched by 2019. Their TMR is high, making 16 orbits daily within an average of 1 h of latency.28
+In our case study, the acquired RS data are stored in different scientific file formats, including the BUFR, Binary, NetCDF, and the HDF5 (variety). These files have some special structure and models to store datasets (vocabulary). Furthermore, these channels afford an enormous file in NRT.
We notice that the daily rate of MDEO is about 20,000 files, the NESDIS reaches 8000 files, the EOSDIS stretch 7000 files, and the Copernicus produces an average of 200 files (veloc- ity). The total amount of collected volume by the four sources sums up to about 37 GB per day andexceeds14TBperyear(volume).Inaddition,satellitedatahavebecomeveryusefulinmany + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Journal of Applied Remote Sensing 018501-5 Jan Mar 2020 Vol. 14(1) +Semlali, El Amrani, and Ortiz: SAT-ETL-Integrator: an extract-transform-load software... +Table 1 Sources channel and characteristics of the used satellite data in the case study. + +Organization Satellite (sensors) Product name Latency (min) File is format (Files/ day) Data amount (MB/day) Copernicus Sentinel 3 (OLCI) Sentinel-3 15 NetCDF 41 14,000 Copernicus Sentinel5P (TROPOMI) Sentinel-5P 15 NetCDF 8 5 4400 MDEO MetOp (IASI, AMSU) EPS-Africa 30 BUFR, Bin 9000 2200 MDEO MetOp (ATVOS) EPS-Global 30 Bin 1000 180 MDEO MSG (SEVIRI) Data_Channel_3 30 GRIB,HDF5 300 240 MDEO NPP (OMPS, VIIRS) NPP-3 30 NetCDF,Bin 1000 1100 MDEO MetOp (GOME-2) SAF-Africa 30 BUFR, HDF5 2000 700 MDEO MetOp (ASCAT, GOME-2) SAF-Europe 30 BUFR, Bin, +HDF5 5000 3800 NASA AQUA (AIRS) AIRS2SUP_NRT.006 15 HDF5 640 5400 NASA AQUA (AMSU) MCDAODHD 360 HDF5 4 4 NASA AURA (MLS) ML2CO_NRT.004 15 HDF5 90 25 NASA AURA (MLS) This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Journal of Applied Remote Sensing 018501-6 Jan Mar 2020 Vol. 
14(1) diff --git a/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt b/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt new file mode 100644 index 0000000..52041a6 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/81-Automated data cleaning of paediatric anthropometric data.txt @@ -0,0 +1,107 @@ +www.nature.com/scientificreports/ www.nature.com/scientificreports +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +www.nature.com/scientificreports + + + + +open Automated data cleaning of +paediatric anthropometric data from longitudinal electronic health records: protocol and application to a large patient cohort +Hang t. t. phan1,2 ✉, Florina Borca2,3, David cable3, James Batchelor1,2, Justin H. Davies3,4 & Sarah ennis1,2,4 +‘Big data’ in healthcare encompass measurements collated from multiple sources with various +degrees of data quality. these data require quality control assessment to optimise quality for clinical management and for robust large-scale data analysis in healthcare research. Height and weight data represent one of the most abundantly recorded health statistics. the shift to electronic recording of anthropometric measurements in electronic healthcare records, has rapidly inflated the number of measurements. WHO guidelines inform removal of population-based extreme outliers but an absence of tools limits cleaning of longitudinal anthropometric measurements. We developed and optimised +a protocol for cleaning paediatric height and weight data that incorporates outlier detection using robust linear regression methodology using a manually curated set of 6,279 patients’ longitudinal measurements. 
The protocol was then applied to a cohort of 200,000 patient records collected from 60,000 paediatric patients attending a regional teaching hospital in South England. WHO guidelines detected biologically implausible data in <1% of records. Additional error rates of 3% and 0.2% +for height and weight respectively were detected using the protocol. Inflated error rates for height measurements were largely due to small but physiologically implausible decreases in height. Lowest error rates were observed when data was measured and digitally recorded by staff routinely required +to do so. the protocol successfully automates the parsing of implausible and poor quality height and weight data from a voluminous longitudinal dataset and standardises the quality assessment of data for clinical and research applications. +With the availability of digital electronic health systems, ‘big’ clinical data has become more accessible to the research community1,2. The big data era, which includes using data obtained from heterogeneous digital sources, has enabled novel opportunities for conducting empirical clinical research. At the same time there are challenges using such data for research purposes, including the need to adapt existing and develop new methodologies to cope with the scale and complexity of the data3. However, a more fundamental issue for researchers is the require- ment to undertake data cleaning, as incorrect clinical measurements entered into an electronic health record (EHR) will significantly affect the quality of dataset. Data cleaning can be time-consuming and involve multiple stages including detailed data analysis to identify error types, data inconsistencies, outlier detection and imple- ment data transformation where required4,5. Thus, developing automated methods for data cleaning is desirable. +Height and weight are the most commonly recorded anthropometric measures for the assessment of child health in both clinical practice and research studies. 
Longitudinal height measurements give an indication of well-being and perturbations may be an indication of nutritional, endocrine, cardiac or other abnormalities that should prompt a clinical decision for investigation or intervention. Body mass index (BMI), defined by heights +1NIHR Southampton Biomedical Research Centre, University Hospital Southampton, Southampton, UK. 2University of Southampton, Southampton, UK. 3University Hospital Southampton NHS Foundation Trust, Southampton, UK. +4These authors contributed equally: Justin H. Davies and Sarah Ennis. ✉e-mail: hang.phan@soton.ac.uk +and weights, may be used to establish risks of prevalence of diseases6. In children, longitudinal changes of BMI provide insight into predisposition to health problems such as obesity, hypertension, type 2 diabetes and nutri- tional insufficiency. +World Health Organisation (WHO) guidelines 7 can be used to exclude biologically implausible values (BIV) from the EHR for childhood height, weight and BMI data, by converting the measurements to standard deviation scores (SDS) and using defined parameters to exclude extreme values (e.g. height to age z-score (HAZ) exclusion if < −6 or >6). However, there are few studies which have evaluated methods for cleaning periodical longitu- dinal anthropometric data 8. For example, some have identified BIVs for annual longitudinal values where the mean changes of BMI values exceed 3SDS or −3SDS and height decrements greater than 1 inch/year, and mean increases in height> 3SDS9,10. Others10 have suggested removing weight measurements where annual changes exceed 22.7 kg or 27.2 kg if the individual was severely obese at baseline, any height decrease and any height increase > 15 cm a year. These methods were developed for identifying extreme changes in periodical measure- ments and do not detect less extreme changes and so are not applicable to children where growth is dynamic. 
Neither are they applicable to the big-data scenario where anthropometric measurements are non-periodical. More recently the jack-knife residual method, applicable to paediatric patients with ≥4 datapoints, was suggested and applied to a paediatric anthropometric dataset for children ≤2 years old11. Although simple to use, it can be too strict in defining the range of plausible values hence not allowing more pronounced fluctuations in longitudi- nal data that are typical in the paediatric clinical setting where an individual can reduce or gain significant weight during or after a treatment period12,13. +University Hospital Southampton (UHS) is a large teaching and research hospital serving a population of nearly 3.5 to 4 million people in South Hampshire. The Southampton Children’s Hospital of UHS initiated elec- tronical recording of anthropometric measurements in 2012 and subsequently developed an Electronic Growth Chart (EGC) which was rolled out for use across departments in the hospital in 201314. Since then, anthropomet- ric data on children has been systematically recorded, improving the accuracy of growth data presentation on a growth chart and enhancing the experience of sharing growth data by clinicians between paediatric specialities. It has also presented an opportunity for research studies to use longitudinal routine patient care anthropomet- ric data and make correlations between childhood growth and development of disease or efficacy of therapy. However, data recorded for routine clinical care by end-users can be prone to typographical or default value entry errors often related to time pressure for care delivery. Hence it is necessary that the anthropometric data be cleaned and processed before it is used for research purposes. +In this study, we developed an automated protocol for identifying outliers of longitudinal routine paediatric height and weight measurements using state-of-the-art outlier detection methods. 
Concurrently, a subset of UHS electronic paediatric height and weight data of patients aged 2–20 years old, the gold-standard dataset manual curated for parameter optimisation, were assessed for data quality. We demonstrate how dataset scrutiny can identify and target training needs in anthropometric assessment in a teaching hospital. +Materials and methods +Anthropometric data scope and extraction. Electronically recorded height, weight measurements and date of birth was extracted for all patients admitted to UHS from 1932–2018 where the patient’s age at date of meas- urement was between 2–20 years. Data prior to 2008 were paper-based archived data transcribed into the elec- tronic EPR system since its introduction in UHS. Measurements are recorded to an accuracy of 1 decimal place for weight (kg) and height (cm). The occupation and department of the staff members entering the data was also cap- tured. Measurements of children of age less than 2 years were not considered in this assessment as the absence of gestational age data prevented accurate calculation of height for age z-scores (HAZ), weight for age z-scores (WAZ) and weight for height z-scores (WHZ). From the raw measurements of height (H, metre) and weight (W, kg), +BMI was calculated as W/H2 and HAZ, WAZ and WHZ were calculated using the LMS method15. +Data quality indicators. In assessing the quality of the captured anthropometric height and weight meas- urements, established data quality indicators for children ≥ 2 years of age were applied: (i) standard deviation (SD) of HAZ, WAZ and WHZ16 (ii) Myer’s Index (MI) for height and weight where MI is a measurement of digit preference of recorded data17. Myer’s Index calculates the divergence in the frequency of the ending digit in the measurements compared with the expected uniform distribution where there is no digit bias. The higher the value, the more biased the measurement towards a digit or two in all measurements, reflecting rounding effects. 
+Conventional data cleaning. The thresholds for normal ranges of HAZ, WAZ and WHZ specified by the WHO Child Growth Standards 18 were applied for height, weight and BMI measurements. Those satisfying the +condition of HAZ, WAZ or WHZ being within the [−6,6], [−6,5] and [−5,5] ranges respectively were retained for further analysis. +Implausible flagging of sparse data. When longitudinal measurement data were sparse e.g. the number of entries per individual was less than four, an implausible increment or decrement flag was applied e.g. gain or +loss of >25% of weight within one day; gain or loss of >40% of weight within three months; gain or loss of >50% of weight within one year; gain of >15% of height within three months; any decrease in height exceeding 1 cm +were flagged for manual checking. +Outlier flagging method for longitudinal data. For outlier flagging of longitudinal anthropometric measurements, robust regressions of the linear regression methodology was adopted19. Robust regressions can handle multiple outliers by introducing residual statistics including influence measurements such as Cook’s dis- tance, DFFITS, DFBETAS20 (see Supplementary for method details). Datapoints with influence statistics exceeding suggested thresholds are temporarily removed from the inference and the regression parameters are re-estimated +from the remaining data. This results in a regression line that best fits the most reliable data. It is this regression line that is used to discriminate outlying datapoints from the entire set of datapoints using the SD fold threshold θ. +Additional checks on height data. In addition to robust regression analysis of the data to detect outli- ers, height measurements were additionally inspected to flag anomalies such as variation in adult height and/or +height decrease over time as follow. 
Final adult height is generally reached at approximately 18 years21, therefore, variation >1 cm from the median height measurements of patients older than 18 years flagged an error in data +recording. Additionally, any decrease in height exceeding 1 cm also prompted a flag to cross check recorded data manually. This check was applied regardless of the number of datapoints in any set of measurements. +Details of the overall longitudinal height and weight data outlier flagging protocol is summarised in Box 1. +Box 1 Summary of final protocol for outlier flagging for longitudinal height and weight measurements of a patient +1. Flag data not satisfying WHO guidelines for heights, weights and BMIs whose SDS values fall beyond the ranges [−6,6], [−6,5] and [−5,5] respectively, remain n datapoints +2. If n < 4: assess the implausible increments/decrements of height and weight measurements: +i. For weight: for each pair of consecutive measurements, use the following method to flag extreme changes as below: +• Time span ≤ 1 day: beyond ±25% +• Time span ≤ 3 months: beyond ± 40% +• Time span ≤ 1 year: beyond ± 50% +ii. For height +• If time span ≤ 3 months, height increase is ≥15% +• If height measurement at time point is at least 1 cm smaller than time point, flag data at time point. +3. With the remaining data, where n > =4: +a. Apply the ordinary least square (OLS) linear regression method of the SDS values as a linear function of age (number of variables k = 1) +b. Calculate influence values: Cook’s distance, dffits, dfbeta for age. Retain data that have Cook’s distance <1, |dffits | <2 and | dfbeta_age | <2/ to re-estimate the regression line and obtain the SD +of the residuals. c. Any patient whose SD of the residuals for height or weight larger than 0.47 or 0.76 respectively has their whole series of measurements flagged for manual inspection. d. 
Where the SD of the residuals for height or weight is ≤1, flag any individual datapoint with resid- ual error exceeding θ x SD where θ is 2.9 for weight and 2 for height (as informed by parameter tuning). e. For height data: +i. Perform adult height check: for age measurements not flagged in (2c) within the range 18–20 years, calculate median value for that individual Mh, and flag as outlier any height measure- ment difference exceeding 1 cm. +ii. Across all age ranges and for data not already flagged, perform height decrease check. If height measurement at time point is at least 1 cm smaller than time point, flag data at time point. +4. If the total number of datapoints flagged (by any step) exceed 40% of the longitudinal data, the whole series of longitudinal data is flagged for manual inspection. +parameter tuning. Typically, datapoints exceeding 2 times the SD (θ) of any series of measurements are nominally flagged as outliers, corresponding to an outlier rate of 5%22. However, for voluminous datasets of +growth data in children, this parameter may be unnecessarily stringent. The tuning of θ was facilitated by a ‘gold-standard’ dataset from UHS, manually curated by an endocrinologist (JHD), where each patient had ≥7 datapoints (Supplementary text). This gold-standard dataset consisted of 6,279 patients with 89,258 weight meas- urements and 4,396 patients with 55,688 height measurements. Of these, 208 (0.23%) weight and 302 (0.54%) measurements were deemed ‘implausible’ by the endocrinologist. 
Additional height checks identified a further 191 (0.34%) height measurements failing the adult height check and 1,237 (2.22%) flagged by the height decrease + +(a) Contingency table of weight outlier flagging (b) Contingency table of height outlier flagging Weight θ = 2.9 Manual curation by clinician Height θ = 2 Manual curation by clinician Impossible Plausible Impossible Plausible Flagging by protocol Outlier 189 2,110 2,299 Flagging by protocol Outlier 1,694 2,775 4,469 Plausible 19 86,940 86,959 Plausible 36 51,183 51,219 208 89,050 89,258 1,730 53,958 55,688 Sensitivity = 90.87% Sensitivity = 97.91% PPV = 8.22% PPV = 37.91% +Table 1. Contingency tables for chosen values of θ for weight and height and their sensitivity and PPV#. #PPV is Positive Predicted Value, defined as the proportion of positive results that are true positive, PPV = TP/ +(TP + FP). + +Figure 1. Percentage of datapoints identified as true errors in the gold standard dataset stratified by year for weight and height, weight for height. Outliers were split into three types: height outlier flagging using linear regression (LR), height entry error with adult height check and height with height decrease check. +check, totalling 1,730 flagged height measurements (3.11%). This yielded a gold-standard dataset with a defined set of ‘true’ errors. +Sensitivity and specificity metrics were evaluated for θ ∈ [1.5,5.5] using the gold standard dataset. Here, a true positive (TP) was defined as a datapoint identified as an outlier that was deemed clinically implausible by the clinician, a true negative (TN) was a value that was not flagged as an outlier by our method and identified as plausible by the clinician, a false positive (FP) was a true plausible value wrongly flagged as an outlier, and a false negative (FN) was a truly implausible value not flagged as an outlier by the protocol. Therefore, the positive pre- dictive value (PPV) is an important metric to consider. 
Ideally, any given protocol should maximise the number of true outliers as a proportion of all data flagged for manual review while maintaining good sensitivity to detect all true outliers. +The gold-standard UHS data were used to calculate sensitivity and PPV for θ ∈ [1.5,5.5] (Fig. S4). For both height and weight, it was desirable to maintain sensitivity above 0.9 while maximising the PPV. Hence for height, +the typical value of θ = 2 was selected but for weight measurements, it was observed that increasing θ to 2.9 main- tained sensitivity above 0.9 but had a dramatic effect on reducing the manual curation of false positive outliers (Table1). These values were used in the final protocol described in Box 1. +The final selected values of θ were applied to gold standard data sets for height and weight respectively. From 55,688 height measurements, a subset of 4469 measurements (representing 2635 patients) were flagged as out- liers for manual inspection. Approximately 92% of the data passed checks and could be automatically classified as plausible. Of the 8% of flagged measurements, the 1237 (2.2%) due to decreases in height may be excluded without further clinical review and only 5.8% of the data may be subjected to further expert review or excluded depending on application. Importantly, the protocol failed to flag 36 measurements across 25 patients that the clinician subsequently flagged as implausible. This represented 0.06% of possible erroneous measurements that would go undiscovered by automated cleaning. Similarly, for weight, 2299 (2.6%) measurements from 1875 patients were flagged as requiring manual expert review while 97.4% of the data passed automated checks. Only nineteen datapoints (0.02%) that were deemed by the clinician as implausible were missed by the protocol. +All the data processing and protocol implementation was performed using the open-source programming language Python version 3.723. 
The ordinary least square method OLS from the Python package statsmodel24 was used to perform LR. The script for calculating SDS values of anthropometric measurements and outlier + +Figure 2. Manual outlier curation results of UHS gold standard paediatric height and weight data: (a) Percentage of outliers for each of the occupation categories for weight, height using LR, height with adult height check, and height with height decrease check. (b) Percentage of outliers for each of the department categories for weight, height using LR, height with adult height check, and height with height decrease check. +detection described by the pipeline is available for use from https://github.com/hangphan/peanof/. This includes the portable Docker container25 where all dependencies required for running the script were set up and ready to be executed on any environment where Docker is made available. +Ethics and information governance. The study was approved by the IG management team of the University Hospital of Southampton (UHS). Ethics approval from the Research Ethics Committee and Health Research Authority, and informed consent was waived by the internal review board at the R&D Department of UHS as this is a combination of an Audit against WHO guidance and Service Evaluation. The anthropometric data in UHS were retrospective data and anonymised. All methods used in this study were performed in accord- ance with the relevant guidelines and regulations. +Results +Data quality of gold-standard longitudinal data. The ‘gold-standard’ UHS height and weight data- set enabled assessment of true data quality. Chronologically, both height and weight measurements across the 2008–2018 were stable with an error rate of ~3% for height and 0.2% for weight (Fig.1). The discrepancy in error rates between the two measurements was largely attributable to decreases in height which were deemed physio- logically impossible. 
+Outlier rate by occupation was highest in the Pharmacist group (0.27%) followed by Others (0.20%) and Dietician (0.16%) for weight. The Pharmacist group recorded the most errors in height as assessed through man- ual review (2.4%) and using the adult height check (5.7%, Fig.2a). This likely reflects the pharmacist’s focus on estimated weight and not height for prescribing purposes. +By department, the Others group has the highest error rate for weight (0.48%) followed by Dietetics/Speech and Language Therapy and Paediatric Neurology (0.16%, Fig.2b). For height data, the highest rate of data deemed implausible though manual review was observed in Dietetics/Speech and Language Therapy (0.63%) followed by Paediatric Medicine (0.44%) and Paediatric Oncology (0.40%). Additional height checks saw the highest combined error rate in Dietetics/Speech and Language Therapy (2.05%) followed by Paediatric Oncology (1.25%, Fig.2b). +Application of automated cleaning protocol to the entire UHS paediatric height and weight dataset (n = 68,595 patients). UHS data summary and characteristics. The entire cohort contained all +records for patients aged 2–20 years, dating from 1932 to 31/12/2018. A total of 214,983 weight measurements (68,273 patients) and 146,635 height measurements (47,616 patients) were obtained for 68,595 paediatric patients in the UHS EPR (Fig.3a), resulting in 142,643 BMI values (46,479 patients). +The number of records was low prior to 2008 (1932–2008) and increased from 2008, reflecting the gradual introduction of EPR system into UHS departments, with a sharp increase in 2014 when the EGC was introduced at the end of 2013 (Fig.3b). The number of weight measurements recorded was about 30% higher than that of height during 2014–2018 period. Additional description regarding age group at initial measurement, length of follow-up time is presented in Supplementary (Fig. S4a,b). 
+Patients were grouped by their respective number of longitudinal height and weight measurements. There is an excess of patients with a single measurement entry and these represent approximately half of the cohort, reflecting paediatric patients with a single hospital visit to departments such as emergency. Patients with ≥7 +entries for height and weight represented ~10% of the cohort but contributed almost half of the entire dataset for both height and weight (Fig.3d,e). These represent the patient population whose ill health may confer growth and developmental irregularities requiring frequent monitoring. + +Figure 3. UHS age 2–20 years’ height and weight data (1932–2018) summary: (a) Number of patients and records of height and weight, broken down by number of datapoints per patients. (b) Total number of height, weight and BMI measurements over time from prior to 2008 to 2018 (c) Percentage of data flagged by WHO guidelines over time. (d) Number of patients within groups of patients defined by their number of longitudinal datapoints for height and weight. (e) Number of height and weight records per group of patients binned by number of datapoints per patient. + +Figure 4. One decimal place digit distribution for height and weight measurements, demonstrating the bias in recording height and weight measurements, rounding to the precision of kg for weight and the precision of cm or 0.5 cm for height. This bias is reflected in the Myers’ index of height and weight measurements. + + WAZ HAZ WHZ DHS RANGE OF SD 1.01–1.49 1.08–2.33 1.01–2.02 PRE-WHO PROCESSING SD 5.29 5.90 15.55 POST-WHO PROCESSING SD 1.45 1.32 1.36 +Table 2. Standard deviation of WAZ, HAZ and WHZ of the UHS 2–20 anthropometric measurement data. + +Figure 5. 
UHS data characterisation by occupation and by department of staff entering the data (a) Weight records by occupation (b) Height records by occupation (c) Percentage of height and weight data flagged by WHO rules by occupation (d) Weight records by department (e) Height records by department (f) Percentage of height and weight data flagged by WHO rules by department.
+Data quality by conventional quality indicators. The number of records failing WHO child growth standard guidelines for weight, height and BMI measurements were 1,386 (0.95%), 814 (0.38%) and 677 (0.47%) respectively. The percentage of records excluded based on WHO limits was highest in 2013 at 2.37%, 2.64%, and 2.71% for weight, height and BMI respectively (Fig.3c). This coincides with the gradual introduction of EGC into various departments across UHS in 2013, reflecting a transient increase in error rate during the transition period to the electronic recording of data. A comparison of the five years preceding the transition to electronic data recording and the five years following 2013 identified a significant reduction (p_weight = 9.97 × 10−23, p_height = 1.05 × 10−8) in these extreme data recording errors.
+The SD of HAZ, WAZ and WHZ was calculated and compared against reported ranges of SD observed in the 52-country DHS survey16 (Table2). The SD values prior to exclusion of WHO extreme datapoints fell significantly outside the expected ranges. However, after exclusions of these extreme values, the observed SD values for height, weight and BMI z-scores fall within the expected limits.
+The Myers’ Index (MI) for digit preference of height data (excluding WHO extreme values) is consistent with the average observed across 51 countries in the DHS survey (MI_UHS = 17.91, MI_51_country_average = 17.8, Fig.4). The MI for weight data is higher (MI_UHS = 10.69, MI_51_country_average = 4.6) suggesting a greater tendency for estimation in UHS weight data.
+Data quality indicators by occupation and department of entry staff. The quality of the extracted data was also scrutinised by staff occupation and department to understand the most likely source of erroneous data and target the training in anthropometric assessments.
+For 75% of the observed data, the occupation and department of the staff member entering the data was available for evaluation. Ninety-three different staff occupations across 96 different departments were noted and the ten staff occupations that most frequently entered height and weight measurements are presented in Fig. 5a,b. Healthcare assistants most frequently recorded weight and height data (24% and 30% respectively) followed by Healthcare support workers, Staff nurses and Consultants.
+Application of the WHO flags for extreme values identified a low and consistent level of less than 1% of likely data entry error across occupations (Fig. 5c). The most striking peak in this type of error was 7.5% noted in the height data entered by pharmacists. However, given pharmacists entered only a very small proportion of the overall height data (n = 214 records), this higher error rate reflects a very small number (n = 16) of extreme values.
+The Paediatric outpatient department contributed most data for weight and height measurements (47% and 58% respectively; Fig.5d,e). The WHO violation rate by department was small and relatively consistent across departments. The highest rate identified was 1.2% amongst weight values recorded within the Paediatric Endocrinology department (Fig.5f).
+Outlier detection for patients with longitudinal records in UHS dataset. For those with 2–3 height measurements, the implausible flagging method identified 655 (2.21%, 607 patients) height decreases >1 cm (Table3).
No height + +Patient group Filter Weight Height All WHO 1,386 (n = 864) 814 (n = 527) 2–3 Extreme change 119 (n = 114) 655 (n = 607) 4–6 OLS robust, few remain 680 (n = 170) 292 (n = 73) Large SD 114 (n = 24) 296 (n = 61) LR 3,626 +(n = 3,531) 3,029 This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Scientific RepoRtS | | https://doi.org/10.1038/s41598-020-66925-7 8 diff --git a/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt b/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt new file mode 100644 index 0000000..9d13890 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/83-Cross-ScenarioPerformanceModelingforBigDataEcosystems2020.txt @@ -0,0 +1,108 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ + +See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/342834960 +Cross-Scenario Performance Modelling for Big Data Ecosystems +Chapter · July 2020 +DOI: 10.1007/978-3-030-50334-5_14 +CITATIONS READS +0 47 +2 authors, including: +Fatimah Alsayoud +Arab Open University - Saudi Arabia +5 PUBLICATIONS 2 CITATIONS +SEE PROFILE +All content following this page was uploaded by Fatimah Alsayoud on 08 March 2023. +The user has requested enhancement of the downloaded file. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+ +Metadata of the chapter that will be visualized in SpringerLink + +Book Title Artificial Intelligence in HCI Series Title Chapter Title Cross-Scenario Performance Modelling for Big Data Ecosystems Copyright Year 2020 Copyright HolderName Springer Nature Switzerland AG Author Family Name Alsayoud Particle Given Name Fatimah Prefix Suffix Role Division Department of Computer Science Organization Ryerson University Address Toronto, Canada Email Corresponding Author Family Name Miri Particle Given Name Ali Prefix Suffix Role Division Department of Computer Science Organization Ryerson University Address Toronto, Canada Email Ali.Miri@ryerson.ca Abstract Performance prediction is an essential aspect of several critical system design decisions, such as workload scheduling and resource planning. However, developing a model with higher prediction accuracy is a challenging task in big data systems due to the stack complexity and environmental heterogeneity. Workload modelling aims to simplify the connection between workloads factors and performance testing. Most of the workload models rely on a single scenario under test (SUT) method, where the trained and the evaluated data have the same distribution. However, a single SUT is not the ideal modelling method for big data workloads, as SUTs change frequently. Big data systems have a considerable amount of possible test scenarios that are generated from changing one or more elements in the testing environment, such as changing benchmarks, software versions, or cloud service types. To address this issue, we propose a cross- Scenario workload modelling method that aims to improve the workloads’ performance classification accuracy. The proposed approach adopts the Transfer Learning concept for reusing models cross different but related scenarios. In this work, we evaluate the proposed approach on multi real-world scenarios in Hadoop which is an example of big data system. 
The empirical results showed that the proposed approach is more accurate than SUT method. Keywords Performance - Modelling - Transfer learning - Big data ecosystems +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +Cross-Scenario Performance Modelling +for Big Data Ecosystems +Fatimah Alsayoud and Ali Miri (B) +Department of Computer Science, Ryerson University, Toronto, Canada +Ali.Miri@ryerson.ca +Abstract. Performance prediction is an essential aspect of several crit- +ical system design decisions, such as workload scheduling and resource +planning. However, developing a model with higher prediction accuracy AQ1 is a challenging task in big data systems due to the stack complexity and environmental heterogeneity. Workload modelling aims to simplify the +connection between workloads factors and performance testing. Most of +the workload models rely on a single scenario under test (SUT) method, +where the trained and the evaluated data have the same distribution. AQ2 However, a single SUT is not the ideal modelling method for big data +workloads, as SUTs change frequently. Big data systems have a consid- +erable amount of possible test scenarios that are generated from chang- +ing one or more elements in the testing environment, such as changing +benchmarks, software versions, or cloud service types. To address this +issue, we propose a cross-Scenario workload modelling method that aims +to improve the workloads’ performance classification accuracy. The pro- +posed approach adopts the Transfer Learning concept for reusing models +cross different but related scenarios. In this work, we evaluate the pro- +posed approach on multi real-world scenarios in Hadoop which is an +example of big data system. The empirical results showed that the pro- +posed approach is more accurate than SUT method. 
+Keywords: Performance · Modelling · Transfer learning · Big data ecosystems
+1 Introduction
+Big data ecosystems have become the main element in today’s technology. The ecosystems support big data sets and provide a variety of execution methods to meet system workload requirements. Big data ecosystems contain heterogeneous hardware and software, and they support a variety of data and workloads.
+Designing optimal management policies and actions for big data ecosystems requires active monitoring and intelligent modeling. The model is designed to test a particular objective, such as performance. Modeling for performance testing is one of the most successful management analysis approaches. It can be used to measure the performance of a specific system object or a specific executing workload. In
+ c Springer Nature Switzerland AG 2020
+H. Degen and L. Reinerman-Jones (Eds.): HCII 2020, LNCS 12217, pp. 1–18, 2020. https://doi.org/10.1007/978-3-030-50334-5_14
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Cross-Scenario Performance Modelling for Big Data Ecosystems 7
+both cases, the performance testing design is impacted by the characteristics of the running workloads. For example, a Hard Disk Drive (HDD) delivers its best performance when it serves sequential access workloads and not random access workloads. Another example is that the Hadoop ecosystem performs better with analytic workloads than Online Transaction Processing (OLTP) workloads.
+Workload performance modeling provides an approach to examine performance on a particular Scenario Under Test (SUT), where the scenario can include the deployment solution, the software version or the benchmark setup of a particular Object Under Test (OUT). An example of OUT is Application Under Test (AUT). In general, the model result is a significant input element on many system decisions such as resource allocation.
Therefore, it is crucial to design an accurate workload model as the performance test results reliability level is in line with the model accuracy. +Designing an accurate workload model for big data ecosystems is a chal- lenging task due to ecosystem complexities and heterogeneity. There are several possible SUTs and lots of different case studies in big data ecosystems. For example, it is typical for the same ecosystem to have multi software versions, test workload performance with different benchmarking tools and to be executed on various deployment solutions [1]. +Different SUTs produce dissimilar workload distributions. Many workload modeling approaches assume that trained and evaluated data has a similar dis- tribution which is the same assumption as ML methods [2]. This assumption does not fit with big data ecosystem characteristics where the workload’s distribution is changed with many possible SUTs. Constructing a model for each SUT from scratch is time-consuming and resource intensive. A similar distribution assump- tion does not work well in many real-life cases. For example, in computer vision, there is a need to recognize numbers either coming from handwritten data or from a picture where they have dissimilar distributions. +A number of deep learning related methods such as Transfer Learning (TL) are developed to deal with the distribution similarity constraint. TL provides a method to transfer knowledge between domains with a dissimilar distribution or dissimilar feature space to avoid building a fresh model every time the SUT is changed and to improve the model’s accuracy. It is a well-used method in computer vision and natural language processing researchers. In this work, we will use TL to improve the performance model in a big data ecosystem. +1.1 Problem Statement and Motivation +The need for an accurate performance model remains even when the SUT or the executing workload is changed in a big data ecosystem. 
Designing an accurate model for a big data ecosystem such as Hadoop while considering SUT and workloads changing is a challenging task. Although there is a lot of Hadoop performance modelling work such as [3,4] and [5], most of it focuses on a single SUT. Only some consider multi SUT. For example, [6] provide a comprehensive analysis of how the workload behaviour, characteristic and distribution changes with SUTs change, and [7] designed a map task scheduling model for multi +cloud service under test. However, none of the work considers improving the performance model for a particular SUT by utilizing another SUT model. +In practice, users typically change the setups to meet individual or application needs. For example, a big data ecosystem may be moved from on-premise to the cloud when there is a need for more storage. Another example is changing the benchmark measurement tool to analyze different SW elements. Although SUTs usually change frequently on a big data ecosystem, the scenarios modification factors have not been considered on the big data performance modelling yet. +In this paper, we investigate the accuracy of a big data ecosystem perfor- mance model with the proposed cross-scenario transfer approach. This approach builds a performance model based on a particular SUT (Scenariosrc ) and then transfers the source knowledge into another SUT (Scenariotgt ) to improve the target model’s accuracy. A cross-scenario transfer approach adopts the inclusion method (multi scenarios) instead of the isolation (single scenario) method that is used by most existing performance modelling approaches. The inclusion method relaxes the sensitivity between model accuracy and the SUT characteristic. We demonstrate the approach with four scenarios: benchmarks, cloud service types, and Hadoop versions each with a couple of hypotheses. The experiential results show noticeable model accuracy improvement on the Scenariotgt with the pro- posed approach. 
+The paper is organized as follows. Sections 2 and 3 give a background of work- load modelling and performance modelling challenges. The proposed approach overview is presented in Sect. 4. The evaluated case studies and the experimen- tal result are discussed in Sect. 5. Finally, related work and the conclusion are presented in Sect. 6 and Sect. 7, respectively. +2 Workload Modelling +In general, modelling provides a foundational methodology to abstract and rep- resent a particular aspect or relationship. Workload modelling establishes a con- nection between the workload characterization and the desired testing object. It helps to track how the workload and the corresponding testing object are changing. There are several possible algorithms for workload modelling such as predication, evolution, optimization and simulation. The algorithm is selected based on the model’s objective. It is important to select the right design factors and define an accurate workload model. This is because many critical manage- ment decisions are using it as one of their fundamental elements. +Today’s big data ecosystems serve a variety of workload types such as Online Transaction Processing (OLTP), Decision Support System (DSS), analytical and Machine Learning workloads. Each type has unique attributes and characteriza- tion. Moreover, the workload’s pattern, behaviour and distributions change with the execution environment. Workload behaviours are very sensitive to execution environment components, setups and capability. +Workload modelling provides a method to simplify the relationship between workload characterization and behaviours with the desired testing object for a +particular testing environment [8]. The testing object is the workload attributes that the model is designed to test it, such as performance, cost and resource utilization. The object measurement metric defined during the model construc- tion is based on the final objective. 
For example, performance can be measured based on the workload’s execution time or the throughput. Another essential aspect of workload modelling is the testing environment that affects workload behaviour and testing object values. In general, the model design is based on data from an environment with an aggregation of SWs and HWs. However, usu- ally only one of the environmental elements is used to define the testing factors. For instance, in the application performance model, the application represents the testing environment and performance represents the testing object. Usually, the test application is called Application Under Test (AUT). The application performance model or workload model for performance testing investigates the relationship between application workloads and the corresponding performance. +Each aspect of the workload model should be designed and selected care- fully since the accuracy of the design affects the accuracy of many management decisions and actions. The model can be used for descriptive, predictive and prescriptive analytics where the analytics output, for example, produces perfor- mance insight or predicts resource provisioning. The workload model can also be used for simulating workloads [9] and evaluating a system configuration [10]. Indeed, the workload-aware concept becomes a common aspect of different man- agement architecture. +Workloads have different behaviours and patterns that change based on many factors like workload structure and the testing environment. For example, the behaviour of database workloads is different than the ML workloads. The last one is more complicated, requiring more resources and taking more time than the first one. The challenge occurs when a particular environment serves both types of workloads which is a normal situation in today’s applications. 
The workload- aware concept is adopted on the system to serve each workload with its need, and define the management decision and action differently for each workload. +3 Big Data Performance Modelling Challenges +Modeling big data workloads for performance testing or in short performance modelling is a challenging task due to the ecosystem’s complexity and the vari- ability of the workload. It is challenging to design an accurate model for a big data ecosystem that has many interacting components and for workloads with very wide distributions. Traditional performance modelling assumes that data comes from a single SUT and has the same distribution. Both assumptions do not meet the need of big data ecosystems. Big data ecosystems have a complex architecture with several stages, multi-configuration parameters and multi SW elements. These ecosystems contain many highly interactive stages such as com- puting, resource management and a distributed file system which control how the workload is executed, how many resources are allocated to it and where it should be placed, respectively. Each of the controlling decisions impacts the workload’s +overall performance. Furthermore, the ecosystems have a massive amount of pos- sible configuration parameters. Each of them has multiple possible values and each of the values affects the performance differently. +The SW elements in big data ecosystems are dependent on each other and some of the elements interact with elements from other ecosystems. For example, the Hadoop resource management element (YARN) [11] is used by many other systems such as Spark [12] and Storm [13]. Also, the Hadoop file system (HDFS) is used by OpenStack Swift and Amazon S3 [14]. The SW characteristics and the interaction have an implication on workload behaviour and therefore workload performance. +Each aspect of the big data ecosystem architecture impacts the performance of the workloads and can cause a change in workload distributions. 
It is hard to keep track of how each aspect of the ecosystem impacts performance. As written by [1] “we do not know much about real-life use cases of big data systems at all”. +Two well-known modelling methods are used for simplifying big data ecosys- tem complexity: white box and black box methods. White box applies when the internal details are essential factors for decision making like considering configu- ration values for configuration tuning [15] or configuration optimization [16]. In contrast, the black box method does not consider the internal ecosystem details, and it is used by most work that focuses on the testing output instead of ecosys- tem details. Most of the black box methods and many of the white box methods follow the original modelling assumption of using a single SUT with the same distribution. Such assumptions would require building a considerable number of models from scratch to cover the possible big data scenarios. The proposed approach in this work benefits from the pre-built models on constructing a new one to improve model accuracy, and save model construction time and resources. +3.1 Scenario Under Test (SUT) Modelling +Most performance modelling approaches rely on a single SUT where data is collected from the same environment setups. For example, if the desired test object is an application, then the model is built based on collecting or simulating data from a particular application. Usually, the model built for a particular application cannot work as accurately for another application. +The performance modelling single SUT requirement is coming from the algo- rithm’s restriction used on the model. The most used algorithms in performance modelling are analytic and Ml algorithms. Both types of algorithms require the trained data and the evaluated data to have the same distributions and feature space. To guarantee those requirements, the performance model expected data needs to come from a single SUT. 
+The issue is that most of today’s case studies deal with changing the original scenario for different reasons. The model’s accuracy cannot be guaranteed when any of the SUT factors are changed. For this reason, in most cases, the whole model has to be reconstructed when any change happens. A large number of models are needed to cover all of the possible scenarios.
+Even though a single SUT method gets great attention from both industrial and academic communities, it has several limitations such as lack of supporting diverse scenarios. It requires constructing many models and isolating the built model from the other related models. It consumes time and resources, and is sensitive to workload distributions. A single SUT limitation motivates us to define the cross-scenario method that can support multi-scenarios in big data ecosystems and improve performance model accuracy.
+4 Proposed Approach Overview
+
+Fig.1. Cross-Scenarios transfer performance modelling
+The proposed approach overview is illustrated in Fig. 1 and the procedures are listed below:
+– The examined dataset is Hadoop execution trace-data that is provided by the ALOJA open-access dataset [17]. The dataset has over 16,000 Hadoop executions with various setups like workload type, benchmark type, Hadoop versions, cloud service types and cloud providers.
+– To provide the cross-scenarios transfer method with the correct data, both the Source Scenariosrc and Target Scenariotgt have to follow the same preparation process. For example, the process includes normalizing numeric data, coding categorical data and classifying the target output.
+– Once the dataset is prepared, the Scenariosrc and the Scenariotgt are defined according to the desired hypothesis. For each examined hypothesis, the definition of the Source and Target scenarios are specified in Sect. 5.
+– The Cross-Scenarios transfer method applies for each formulated hypothesis.
The method contains three steps: build the source model according to Scenariosrc, build the target model according to Scenariotgt, and build the cross-scenarios transfer model according to the built source model and the Scenariotgt.
+– Source and Target models are constructed with Multi-Layer Perceptron (MLP).
+– The built source model knowledge is used to build a cross-scenarios transfer model for the Scenariotgt.
+– The accuracy of results for the target (stand-alone) model and the target (cross-scenarios transfer) are analyzed for each hypothesis.
+– We execute each hypothesis three times to calculate the average result of stand-alone and Transfer Learning models.
+– To study the impact of sample size on the model’s accuracy, we examined each hypothesis with six sample sizes (50, 150, 250, 350, 450, and 500), represented in the experiments as ratios.
+4.1 Methodology
+Transfer learning is defined to relax distribution similarity constraints on the trained and the evaluated data. TL assumes that the trained dataset and the validated dataset have different but related distributions. The TL method can be applied to almost all of the learning models such as classification, regression, and clustering. It provides a way to transfer knowledge between different learning tasks or between different domains. There are two types of domains: Source and Target. The Source domain is where the knowledge transfers from and the Target domain is where the knowledge transfers to.
+5 Case Studies and Experimental Result
+In order to evaluate the proposed approach, three different case studies are defined as Hadoop software versions, benchmark types and cloud service types. Each case study contains real-life scenarios that are used to determine the examined cross-scenario transfer.
+5.1 Software Versions
+Commercial and open-source software companies produce new software versions either to add new features or fix the software bugs.
This can happen at any stage of the software life cycle. The frequency of producing new versions is in accor- dance with the software design model. In general, open-source software, such as big data ecosystems, release new minor and major versions more repeatedly than commercial software. +Versions have different configurations and therefore, the trace data that is produced is different in products. The trace-based method is the most used work- load modelling method. Following how versions change is not a straightforward +Table 1. Experimental results: Hadoop versions hypothesis + +Hypothesis (Hadoop-1.0.3 → Hadoop-1.2.1) (Hadoop 1 → Hadoop 2) (Hadoop-1.2.1 → Hadoop-2.7.1) Sample ratio Stand-alone TL Stand-alone TL Stand-alone TL 10% 0.236 ± 0.043 0.371 ± 0.100 0.270 ± 0.040 0.391 ± 0.017 0.243 ± 0.070 0.278 ± 0.063 30% 0.310 ± 0.035 This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. diff --git a/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt b/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt new file mode 100644 index 0000000..e3368cd --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/9 - Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets.txt @@ -0,0 +1,160 @@ + +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ + +Received July 23, 2020, accepted August 2, 2020, date of publication August 7, 2020, date of current version August 20, 2020. Digital Object Identifier 10.1109/ACCESS.2020.3015016 +SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing +of Large NGS Datasets +ROBERTO R. 
EXPÓSITO, ROI GALEGO-TORREIRO, AND JORGE GONZÁLEZ-DOMÍNGUEZ
+Universidade da Coruña, CITIC, Computer Architecture Group, 15071 A Coruña, Spain
+Corresponding author: Roberto R. Expósito (roberto.rey.exposito@udc.es)
+This work was supported in part by the Ministry of Science and Innovation of Spain under Grant TIN2016-75845-P
+and Grant PID2019-104184RB-I00, in part by AEI/FEDER/EU under Grant 10.13039/501100011033, and in part
+by the Xunta de Galicia and FEDER funds (Centro de Investigación de Galicia accreditation 2019–2022 and
+the Consolidation Program of Competitive Reference Groups) under Grant ED431G 2019/01 and Grant ED431C 2017/04.
+ABSTRACT This paper presents SeQual, a scalable tool to efficiently perform quality control of large genomic datasets. Our tool currently supports more than 30 different operations (e.g., filtering, trimming, formatting) that can be applied to DNA/RNA reads in FASTQ/FASTA formats to improve subsequent downstream analyses, while providing a simple and user-friendly graphical interface for non-expert users. Furthermore, SeQual takes full advantage of Big Data technologies to process massive datasets on distributed-memory systems such as clusters by relying on the open-source Apache Spark cluster computing framework. Our scalable Spark-based implementation allows to reduce the runtime from more than three hours to less than 20 minutes when processing a paired-end dataset with 251 million reads per input file on an 8-node multi-core cluster.
+INDEX TERMS Big data, next-generation sequencing (NGS), bioinformatics, quality control, Apache Spark.
+I. INTRODUCTION
+The development of Next-Generation Sequencing (NGS) technologies [1], [2] has revolutionized biological research over the last decade by drastically decreasing the cost of DNA/RNA sequencing and significantly increasing the throughput of generated data. The quality of NGS data is considered very important for various downstream analyses such as gene expression studies and genome sequence assembly [3]. However, NGS platforms introduce, as a downside, different kinds of artefacts in the raw sequence fragments (the so-called ``reads'') such as duplicates, poor-quality reads and insertions/deletions, which can lead to serious negative impact on downstream analyses. Therefore, most bioinformatics pipelines start by applying a quality control over the input datasets in order to increase the accuracy of subsequent processing. Some examples of these operations are the removal of duplicate reads, the deletion of reads with low average quality, or their transformation to maintain only the fragments with high quality (trimming). Moreover, during this preprocessing step the datasets sometimes must be transformed in order to adapt them to the requirements of the pipeline. For instance, transforming the input data from FASTQ to FASTA format may be necessary if any bioinformatics application can only work with data stored in the latter format. Currently, there are several tools to perform quality control and preprocessing of raw NGS data in order to ensure the necessary quality for further processing [4], [5].
+However, state-of-the-art tools still require excessive time to process the increasingly large datasets generated through mainstream NGS platforms. Although there are some parallel tools that allow to accelerate their computations on shared-memory systems thanks to including efficient multithreading support, this is not enough to complete the quality control of current large datasets in reasonable time since their scalability is limited to the resources of a single machine. In this context, the exploitation of Big Data technologies seems an adequate approach in order to accelerate those calculations on distributed-memory systems such as clusters and cloud platforms, as extensively demonstrated by the existing literature [6]–[8]. In this paper we introduce SeQual1, a scalable tool for quality control and preprocessing of raw sequencing data implemented upon the most popular open-source distributed framework for Big Data processing: Apache Spark [9].
+The associate editor coordinating the review of this manuscript and
+approving it for publication was Juan Wang. 1Source code available at https://github.com/roigalegot/SeQual.
+VOLUME 8, 2020 This work is licensed under a Creative Commons Attribution 4.0 License. For more information, see https://creativecommons.org/licenses/by/4.0/ 146075
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets
+SeQual is mainly inspired by PRINSEQ [10], one of the most popular tools for quality control which has been widely used in many recent biological studies [11], [12]. The main advantages of PRINSEQ over alternative tools are its simplicity and great functionality, providing support not only for a wide range of quality control operations (such as filtering and trimming), but also for data formatting. Our tool also provides all this functionality (and even more) but in a significantly lower runtime by fully exploiting the parallel processing capabilities of Spark. Although there are a few parallel tools to remove duplicate DNA/RNA sequences (one specific operation that can be used for quality control) on distributed-memory systems [13], [14], up to our knowledge, SeQual is the first publicly available tool intended for this type of parallel systems that provides full functionality (more than 30 operations) instead of only allowing to remove duplicate reads. Furthermore, SeQual includes a graphical user interface intended for simplifying its usage.
+The remainder of the paper is organized as follows.
Section II discusses the related work. Section III describes the overall functionality provided by SeQual. Section IV describes our parallel approach. The performance of SeQual is evaluated and compared to state-of-the-art quality control tools in Section V. Finally, Section VI concludes the paper and proposes future work. +II. RELATED WORK +To address the sequencing quality problem, besides the quality control pipeline supplied by some sequencing plat- form manufacturers, several standalone tools have been proposed in the literature. A representative list includestools such as FASTX-Toolkit [15], FastQC [16], PRINSEQ [10], NGS-QC [17], QC-Chain [18], FaQCs [19], Trimmo- matic [20], PEAT [21], AfterQC [22], FastProNGS [23] and PRINSEQCC [24]. With the expected increase in total generated data and decrease in costs associated with NGS technologies, one important concern is their processing speed. Some tools do not provide parallel implementations (FASTX-Toolkit, PRINSEQ), whereas others (FastQC) han- dleparallelismonlyatthelelevel,sotheycannotaccelerate the processing of a very large single dataset. The remaining tools do provide some kind of parallel support but all of them are based on multithreading, so their overall speed is limited to the computational resources of a single machine. +In terms of functionality, FastQC does not have trimming and ltering features, whereas Trimmomatic is focused on just one operation type (trimming), and PEAT provides very few lter options to the users. FASTX-Toolkit does not even support paired-end datasets, requiring further postprocess- ing to link paired reads. Other tools (FaQCs, FastProNGS) do not support FASTA as input format, while also pro- vide basic user interfaces only limited to command-line interaction. Moreover, there are tools that just seem to be currently unavailable as their websites do not longer work (NGS-QC, QC-Chain). 
Among all of them, PRINSEQ is by far the solution that provides the widest functionality +supportingdifferentquality-controlandpreprocessingopera- tions together with a nice web-based graphical user interface. This is the main reason why the functionality of SeQual has been based on PRINSEQ, even extending it. However, the sequential implementation of PRINSEQ using Perl clearly hinders its performance for large datasets, whereas itsmultithreadedCCCversion(PRINSEQCC)ismuchfaster butprovideslessfunctionalitythantheoriginaltool,whileits scalability is still limited to a single machine. +SeQual tries to combine the functionality and usability of PRINSEQ together with the performance of PRINSEQCC but in a distributed manner relying on Big Data technologies. In fact, the exploitation of Big Data clusters to accelerate the storage, processing and visualization of large NGS datasets has been recently explored in multiple previous works. For instance, many bioinformatics tools implemented on top of Big Data processing frameworks such as Hadoop [25] and Spark [9] have emerged in recent years, from error correction [26], [27], duplicate read removal [13] and sequencealignment[28][31], tovariantcalling[32],denovo genome assembly [33], [34] and protein structure prediction [35][37], among many others. Most of these tools are exe- cutedwithinabioinformaticspipeline(orscienticworkow engines such as SAASFEE [38] or Pegasus [39]) that usually starts with a quality control of the input FASTA/FASTQ datasets. Therefore, they will benet from SeQual in order to accelerate this rst step of the pipeline, which reinforces the need of our proposal in the context of quality control and preprocessing. +III. OVERVIEW OF SeQual +SeQual is a parallel tool implemented in Java that currently provides a full set of 33 operations for performing qual- ity control and preprocessing on raw NGS datasets. 
It can receive as input either single-end or paired-end DNA/RNA sequences, which can be stored either in FASTA or FASTQ les, as these are the most popular unaligned sequence for- mats. The operations provided by SeQual can be divided into the following four main functionalities: +1) Filters. These operations discard those input reads that do not fulll a certain criteria specied by the user. Filters are divided into two categories, depending on the number of sequences involved in the lter ruleV +• Single lters, which evaluate reads one-by-one. SeQual includes 12 single lters. For instance, sequencescanbelteredaccordingtotheirlength, quality or the absence/presence of a certain pattern in their bases. +• Group lters, which compare reads by pairs and discard those that are equal (keeping the one with the highest quality score when possible). SeQual contains 5 group lters that allow, for instance,tocomparethesequencesascomplement or reverse-complement. The user can also specify acertainnumberofallowedmismatchestodiscard those sequences that are almost equal. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 +146077 +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets + +FIGURE 1. Graphical user interface included with SeQual. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +2) Trimmers. SeQual includes 10 operations in order to trim the beginning or ending of the sequences by removing those bases that are not interesting for the user. The user can specify the number of bases that must remain, or the quality required for the trimmed sequences. +3) Data formatters. Three functions to convert from DNA to RNA reads (and vice versa) or from FASTQ to FASTA formats are also provided by our tool. 
+4) Statistical operations. Finally, SeQual provides three additional functions to obtain some statistics about the initial and/or final data. For instance, these operations can be used to count the number of input sequences, or to calculate their average length/quality. +Regarding the usage of the tool, SeQual provides two execution modes: +• Through the command-line interface by specifying: +(1) the path to the dataset(s) as input arguments; (2) the operations to be performed on these datasets using a Java Properties file. +• Through a graphical interface provided by SeQual in order to simplify its usage to non-computer science experts (see Fig. 1). This graphical interface has been implemented upon the open-source JavaFX project [40], which allows built-in separation between the application logic and the visual part of SeQual. +It is worth noting that the user can apply multiple operations to the same input dataset in a single execution (see the available check boxes in Fig. 1). In this scenario,
+2) Processing of the input les according to the quality-control operations selected by the user in the graphical interface or, otherwise, specied in a Properties le when using the command-line interface. +3) Writingoftheprocesseddataset(s)totheircorrespond- ing output text les as a result of the computations previously performed. +In order to understand how these stages have been imple- mentedontopofSpark(SectionsIV-BandIV-C),somebasic + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets + +FIGURE 2. Spark example of combining map/filter transformations and count action over an RDD of type Integer. + +FIGURE 3. Example of two DNA reads in FASTQ format (100 base pairs). + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +concepts about the programming model provided by this Big Data framework need rst to be introduced (SectionIV-A). +A. APACHE SPARK +Spark [9] is a popular Big Data processing framework that supports efcient in-memory computations by relying on a novel, distributed data abstraction known as Resilient Dis- tributed Dataset (RDD) [41]. Basically, an RDD is a par- titioned collection of data elements that can be distributed across the nodes of a commodity cluster. One important feature of RDDs is that their partitions can be operated in parallel and cached in memory to be reused in subsequent MapReduce-like operations [42]. A Spark programmer can create an RDD in two different ways: either by parallelizing an existing collection of objects (e.g., a list); or by loadingan external dataset from a supported le system. 
In order to allowdataprocessinginadistributedmanner,Sparkprovides support for the Hadoop Distributed File System (HDFS) [43] so that RDDs can be created and efciently processed from datasets stored in it. Nowadays, HDFS is considered the mostpopularopen-sourcedistributedlesystemforBigData processing, providing the fundamental storage layer within the Hadoop ecosystem [25]. +The RDD programming API provided by Spark supports a wide range of data-parallel operations that can be performed over an RDD. Those operations can be divided into trans- formations and actions. On the one hand, transformations (e.g., map, lter, join) create a new RDD from an exist- ing one. For instance, a map transformation processes each RDD element through a user-dened function, returning a new RDD as result. Another example is lter, which returns a new RDD formed by selecting only those elements of the source RDD on which a user-dened function returns true. Note that transformations are lazily evaluated in Spark, so they do not compute anything until an action that requires the result from them is triggered. On the other hand, actions return non-RDD values, converting the laziness of transfor- mations into actual computation. Actions can be used to either return a result to the main Spark program (e.g., reduce, collect, count), or to store an RDD in external storage after running a certain computation (e.g., saveAsTextFile, +saveAsObjectFile).Forinstance,thereduceactionaggregates all the RDD elements according to a user-dened function and returns the nal result to the main program. As an illus- trative example, Fig. 2 shows the chaining of a map and lter transformations together with a count action over an RDD oftypeInteger.Notethattheuser-denedfunctionsexecuted overtheinputRDDareshownbelowthecorrespondingboxes for map and ltertransformations. 
+Finally,anotherinterestingfeatureofSparkisthatitallows to explicitly cache or persist the RDD elements in memory, thus providing much faster access to them the next time they are queried. This is extremely useful for implementing efcient iterative algorithms [44]. +B. RDD MANAGEMENT IN SeQual +All the RDD objects managed by SeQual are created from the input datasets stored in HDFS, which represents the rst stage of the overall workow previously described. The most straightforward way to create an RDD from an input text le stored in HDFS would be using thetextFile method provided by Spark. Unfortunately, this method is not able to handle properly the specic structure of the FASTQ/FASTA text-based le formats, as both involve mul- tiplelinespersequence(e.g.,fourlinesforFASTQ,asshown intheexampleofFig.3).ThisSparkmethodreliesbydefault on newline characters to identify the individual records in the input le (i.e., it creates one input record per line). Although it is possible to change the default delimiter to separate individual records according to the sequence format (e.g., FASTQ reads begin with character `@'), this solution would not work since such character can also occur in the string that represents the quality scores associated with each base (qualities are stored in the fourth line of each FASTQ read, as shown in Fig. 3). +To overcome such issues, other previous bioinformatics tools implemented using Big Data technologies [28], [45] generallyperformapreprocessingoftheinputlestoconvert them into the required line-by-line format (i.e., one read per line). Next, the converted les are copied to HDFS to be processed. In the specic case of Spark, another solution is to create the RDD using the previous textFile method + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 +146079 +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets + +FIGURE 4. 
SeQual example of combining DNATORNA and TRIMLEFT operations. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +and then operate over it with additional transformations and actions to obtain the desired format [29]. However, those approaches incur additional disk/memory overheads, degrad- ing the overall performance. Instead, SeQual relies on the Hadoop Sequence Parser (HSP) library [46] to create the input RDDs in order to avoid any additional preprocess- ing/transformation of the input les. HSP is a Java-based library that provides specic and optimized routines to parse FASTQ/FASTA les directly from HDFS, and it is cur- rently compatible with Hadoop, Spark and Flink [47] data processing frameworks. +Once the input RDDs are created using the HSP library (rst stage), the transformations and actions provided by the Spark's API can process their partitions during the second stage according to the quality-control operations specied by the user, as will be explained in the next subsection. Finally, the RDDs resulting from performing those operations are written back to HDFS by SeQual to create the output les (third stage). In this case, Spark provides a suitable RDD action (saveAsTextFile) to do so straightforwardly. +C. SPARK-BASED QUALITY CONTROL AND PREPROCESSING +To efciently implement all the functionality provided by SeQual (see Section III), each supported quality operation must be translated into the appropriate combination of trans- formations/actions to be performed over the input RDDs which have been previously created using the HSP library. +Regarding to single lters, these operations were imple- mented using an RDD ltertransformation, as they evaluate input reads one-by-one. 
As mentioned before, this transfor- mation returns a new RDD that contains only those elements of the input RDD on which a user-dened function returns true.So,theimplementationofeachsinglelterprovidestwo functions for single- and paired-end mode, and their specic logic depends on the rule used to lter out sequences. For instance, the LENGTH lter compares the length of each read(i.e.,thenumberofbases)withaminimumormaximum threshold specied by the user, returning false when the read must be ltered out from the resulting RDD and true otherwise. +Group lters represent a much more complex computa- tion as input reads are compared by pairs. For instance, the DISTINCT lter requires to check all read pairs in order to remove duplicated sequences. These group lters rst gener- ateaPairRDD,whichisanRDDconsistingofkey/valuepairs +as elements. To do so, these operations apply a mapToPair transformation to the input RDD, which is similar to map but itallowsreturningaPairRDD.Thefunctionexecutedbymap- ToPairoutputsaskeyastringthatrepresentsthebasesofeach read for the DISTINCT lter (or the reverse, complementary or reverse complementary if the lter requires so). As value, the function outputs the sequence object itself, which con- tains not only the bases but also the sequence identier and the qualities (if available). Once this PairRDD is created, a reduceByKey action is applied over it so that all the values (i.e., sequences) for each key are aggregated and then reduced based on a given user-dened function. The reduce function simply discards one of these similar sequences, keeping the one with the highest quality score (if available). Note that the group lters are consid- ered network-intensive operations as the reduceByKey action requirestoshufedataoverthenetworkinordertoaggregate all the values for the same key. 
+The implementation of trimmers and data formatters both rely on applying a single map transformation over the input RDD, performing the appropriate modications to each read depending on the specic operation. For instance, the func- tion executed by the map transformation in the case of TRIMLEFT (operation that removes a number of bases spec- ied by the user starting from the left) modies the string that represents the bases for each read using the substring Java method. Such modications must also be performed on the string that represent the quality scores when avail- able. An example of a data formatter is DNATORNA, whose function executed by map replaces each thymine base from the input DNA reads (represented by a `T' character) by its corresponding uracil counterpart (a `U' character) in the out- put RNA reads, using the replace method provided by Java. As a representative example, Fig. 4 shows the combination of both operations (DNATORNA and TRIMLEFT) over an input RDD containing four DNA reads. +Finally,theimplementationofthedifferentstatisticaloper- ations differ greatly. The COUNT operation was straightfor- ward to implement as it takes advantage of the count action provided by Spark that returns the number of RDD elements (i.e., sequences) in the dataset. However, the remaining two operations(MEANLENGTHandMEANQUALITY)require a more complex approach, being very similar for both of them.Toimplementthosefunctions,theaggregateactionwas selected. This action allows operating an RDD to generate a single nal result that can be of a different type than that + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +TABLE 1. Cluster node characteristics. TABLE 2. Main configuration parameters of Spark and HDFS. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets + +of the input RDD. To do so, the aggregate action takes two user-defined functions as arguments. The first one operates once for each RDD element in a partition, so it is used to accumulate the results for each RDD. The second function combines all the intermediate results (one result per RDD partition) to produce the final result that is finally returned to the main program. For instance, the first function for MEANQUALITY computes the number of reads in each partition and the accumulated quality for all of them, while the second function combines all the accumulated qualities and number of reads for all the partitions. Next, the final result (i.e., the mean quality) is simply obtained by dividing the total quality score by the total number of reads. +V. PERFORMANCE EVALUATION +The correctness of the results provided by SeQual has been assessed by checking that it provides the same outputs as PRINSEQ (a widely used and tested tool) when applying identical operations over the same input datasets. Therefore, the experimental evaluation has only focused on execution time. In order to check the correctness of the statistics (not available in the state-of-the-art tools), we have compared the outputs of SeQual to the statistics provided by some text editors about the total number of lines and characters in the output files. +To evaluate the performance of SeQual, an eight-node multi-core cluster has been used for the experimental evaluation. Table 1 shows the main hardware and software characteristics of each cluster node, which mainly consists of two Intel Xeon E5-2660 octa-core Sandy Bridge-EP processors at 2.2 GHz (i.e., 16 physical cores per node), 64 GiB of memory and one local disk intended to be used for both HDFS and intermediate data storage during the execution of the experiments.
The cluster nodes are inter- connected through Gigabit Ethernet (1 Gbps) and Inni- Band FDR (56 Gbps). The system runs Linux CentOS release7.7.1908withkernel3.10.0-1062andtheJavaversion + +is Oracle JRE 1.8.0_241. According to these characteris- tics, Apache Spark version 2.4.4 was congured as shown in Table 2, which also contains the main relevant congu- ration parameters for HDFS (i.e., block size and replication factor).TheversionofHadoopdeployedintheclustertostore the input datasets in HDFS was 2.9.2. We have compared SeQual with PRINSEQ [10], one of the most popular quality control tools (see Section II), together with its multithreaded counterpart PRINSEQCC [24], using the latest available version of both tools. PRINSEQ was executed with Perl v5.16.3, whereas PRINSEQCC was compiled with GNU GCC v8.3.0 using the -O3 optimization ag. +Two publicly available datasets in FASTQ format obtained from the Sequence Read Archive (SRA) [48], [49] of the National Center for Biotechnology Information (NCBI) [50], [51] were used for the performance evalu- ation: SRR534301 and SRR567455. Table 3 shows their main characteristics. The number of reads (fourth column in the table) refers to the number of sequences per input le contained in the dataset, whereas the read length (fth column)isexpressedintermsofthenumberofbasepairs(bp) per sequence. We have selected these datasets as they repre- sent two different scenarios in terms of size and read lengths. +Table 4 shows the runtimes of PRINSEQ, PRINSEQCC and SeQual when processing those datasets both in single- and paired-end modes (i.e., processing one or two input les, respectively) for the following six representative operations: +• NONIUPAC:singleltertoremovethosereadswithone or more Non-IUPAC bases (any base other than `A', `T', `G', `C' or `N'). +• GCCONTENT: single lter to remove those reads with a percentage of Guanine (`G') and Cytosine (`C') lower or higher than a threshold specied by the user. 
+• DISTINCT: group lter to remove duplicate reads maintaining the ones with the highest quality. +• DNATORNA: data formatter to convert from DNA to RNA reads. +• COUNT: statistical operation to count the total number of reads in the dataset before and after performing any other operation over it. +• MEANQUALITY: statistical operation to compute the averagequalityofallthesequencesavailableintheinput dataset. +We have not assessed the performance of complex jobs that combine several operations in order to keep this section easy to read. Nevertheless, the improvement of SeQual over PRINSEQ and PRINSEQCC in this type of jobs would be at least the addition of the performance improvement in the individual operations. Note also that Table 4 shows + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 +146081 +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +TABLE 3. Public datasets used in the experimental evaluation. + +TABLE 4. Runtimes (in seconds) for PRINSEQ (using one core), PRINSEQCC (using one whole node, 16 cores) and SeQual (using 16 cores in one node and 128 cores in eight nodes) when performing different operations on two different datasets in single- and paired-end modes. Operations not available in PRINSEQ and PRINSEQCC are indicated with `'. + + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 + +R. R. Expósito et al.: SeQual: Big Data Tool to Perform Quality Control and Data Preprocessing of Large NGS Datasets +two runtime results for SeQual: using one whole node (i.e., 16 cores) and the eight nodes of the cluster (128 cores in total). PRINSEQCC was executed on the 16 cores of one whole node, while PRINSEQ only used one core, as it is a sequential tool. 
Statistical operations could not be com- pared as they are not available neither in PRINSEQ nor in PRINSEQCC.Moreover,PRINSEQCC doesnotprovidethe DNATORNA formatter. +As can be observed, SeQual is signicantly faster than the original tool PRINSEQ in all the scenarios even using only one node. When comparing SeQual with the multithreaded version (i.e., PRINSEQCC) using the same amount of hard- wareresources(i.e.,onewholenode),SeQualisfasterforhalf of the scenarios (it depends on the dataset and/or the opera- tion).Forinstance,SeQualisfasterthanPRINSEQCC forall the single-end experiments. Nevertheless, the main benet of implementing SeQual upon a cluster computing framework such as Spark is the possibility of exploiting the performance of multiple nodes in order to reduce even more the exe- cution time. When exploiting the whole cluster (8 nodes), SeQual is signicantly faster than PRINSEQCC for all the scenarios. More specically, our tool is on average around +23.6 and 8.3 times faster than PRINSEQ and PRINSEQCC, respectively, providing signicant speedups of up to 41.5x and 12.4x (both results achieved for the GCCONTENT lter operation when processing the SRR56 dataset). It is worth noting that the performance comparison has been limited to PRINSEQ and PRINSEQCC as, up to our knowledge, these are the tools of the current state of the art with the widest functionality(although,ascanbeseeninTable4,SeQualpro- vides even more operations). We have not compared to other tools such as Trimmomatic [20] as the number of operations that they offer is quite limited, and therefore in our opinion theirfunctionalityisnotcomparabletothatofSeQualoreven PRINSEQ. For instance, none of the operations that have been assessed in this experimental evaluation are available in Trimmomatic. +In order to measure the scalability provided by the Spark-based implementation included in SeQual, Fig. 5 reports the speedups obtained when varying the number of nodes from one to eight. 
The baseline is the execution time of SeQual for each operation when using one whole node, i.e., the speedups show the acceleration obtained thanks to exploitingmultiplenodescomparedtousingonlyone.Ascan + +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +VOLUME 8, 2020 +146083 diff --git a/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt b/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt new file mode 100644 index 0000000..3335519 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/93-A_Big_Data_Framework_for_Quality_Assurance_and_Val.txt @@ -0,0 +1,125 @@ +International Journal of Recent Technology and Engineering (IJRTE) +ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019 +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +A Big Data Framework for Quality Assurance and Validation +S. Nachiyappan, Justus S + depends purely on format. It can be in any structured or Abstract: Big data is a new technology, which is defined by unstructured format or it can be also a corrupted file. The data +large amount of data, so it is possible to extract value from the which are collected from the various sources like social media capturing and analysis process. Large data faced many challenges and digital media will be constructive and structured.It is dcoume ptolexvaitrioyauns d fepearfoturerms asuch nce. 
Mas anvoyluorgmae,nspizaetieodn, s vafariaceticohna,llveanlugees , tough to analyze the types of data. There are many types of +while facing test strategies for structured and unstructured data data like we categorize under structure and unstructured. It is validation, establishing a proper testing environment, working very difficult to analyze all types of dataThere are some with non relational databases and maintaining functional testing. flexible solutions for DBMS and RDBMS such as Oracle. +These challenges have low quality data in production, delay in The RDBMS is used for structured query language or SQL to execution and increase in cost. Reduce the map for data intensive manage, define, query, and update data. However, suppose business and scientific applications Provides parallel and scalable +programming model. To get the performance of big data data size is irresistible, it seems that RDBMS can handle hard, applications, defined as response time, maximum online user data and if done, the process becomes more expensive. It proves capacity size, and a certain maximum processing capacity. In that relational databases are not capable of managing large +proposed, to test the health care big data . In health care data data and some new technologies are needed for processing the contains text file, image file, audio file and video file. To test the data. Customary databases are accurate for structured data bpigre pdroactaessdinocgutesmetinnt,g abny dupsinost gprotwocesscoinngc etespts tinsuch g. Toacs labssigify dathtae and not for unstructured data. Big data contains the three +data from unstructured format to structured format using SVM characteristics such as volume/variety and velocity always algorithm. In preprocessing testing test all the data, for the called as 3V’s.Volume refers to an algorithm ability to deal purpose data accuracy. In preprocessing testing such as file size with a large amount of data. 
The scale of the data set is the +testing, file extension testing and de-duplication testing. In Post quantity for the clustering algorithms related to volume Proeasily tcessoinfegtch to thimepdlematae. nt the map reduce concept for the use of property, the higher the size, the handling outlines. The data +set is a collection of data set properties. Classification of +Index Terms: Preprocessing, Map reduce in Post Processing, features, nominal, ordinal, interval and ratio. Many clustering Structured data using SVM. algorithms support numerical and classification data. In large quantities, the size of the data set increases to maintain large +I. INTRODUCTION data, and the dimensions do not even increase. It's a curse of + Big data is new forms of information processing that size. In many clustering algorithms are capable of performing promotes large volume, high Speed with communication setbacks. Noise data can be grouped with data points. Variety assets, improved awareness, cost effective, decision making indicates the ability of a clustering algorithm to perform and process automation. Data represented large quantities is various sets of data sets, such as numerical, classification, nothing but Big Data. True, there is no specific size parameter nominal and ordinal. A criterion for clustering algorithms is a that defines this technology size. This is the safe way to set of data and cluster shape type. The size of the data set is measure the standard route of terabytes even pet bytes. The smaller or larger, but clustering algorithms support larger data data travels from various directions, and the speed and sets for large data mining. In cluster shape, the set of data volume will be terrible. Data will be replaced at a faster pace cluster is based on size and type shape. Velocity refers to the and therefore require more processing, especially for social calculation algorithm's calculations based on the complexity media feeds. 
But it is not the only medium to get information. of the time period of the clustering algorithm. If the It comes from different sources and shapes. If you go through algorithm's calculations are too low, nothing algorithm has the data you can find text files, audio files, images, video files, less run time. The algorithms run based on the Big O Option. presentations, sensor datas, data bases and log files. It The Artificial Neural Network algorithm is based on a cognitive approach, namely, a neural network without the +hidden layer. Although this approach could lead to poor quality in classification, it was easily selected for construction. As with the SVM model we created a perception classification for each binary combination. A node has an input layer of a node for classification. Perception has an output layer that represents a number of two categories that + Revised Manuscript Received on 30 July 2019. * Correspondence Author Nachiyappan S*, Assistant Prof (Sr.), SCSE, VIT University, Chennai. Justus S, Associate Professor, SCSE, VIT University, Chennai. © The Authors. Published by Blue Eyes Intelligence Engineering and Sciences Publication (BEIESP). This is an open access article under the CC-BY-NC-ND license http://creativecommons.org/licenses/by-nc-nd/4.0/ +belong to an example given +either 0 or a 1. +Using the full feature set rules for input layer increases the +computation, but stabilizes the feature set for comparison with Big Data is defined as datasets whose size is very huge and it the SVM algorithm. cannot be adopted in a traditional database tools to do all the +data processing. This is a specific definition which defines big +II. RELATED WORK data in terms of its context not the metric. 
This was discussed in Mckinsey’s report 2011 NIST has defined big data in some +BdepigenDatads udpooes n itsno t feameatunres thanat dit it is isa vderiffyerlarengtiatede volubmyethoe fd“Verata it y other way like “ big data is where the data acquisition data +volume and velocity or variety of data limits the ability to larbigge data data”in anliterd “atuhurge e andata”d th.erTe herare e arsoe mme andyefindefitioinitions wnshichfor perform the analysis on data. There are certain limitations that +plays a very important role. Big Data is Defined by IDC in which are needs to be addressed before processing it”. There 2011 : “Big data technologies describe a new generation of is also some other definitions which states that“software technologies and architectures, designed to economically libraries along with their associated algorithms that enable extract value from very large volumes of a wide variety of distributed processing and analysis of big data problems data, by enabling high-velocity capture, discovery, and/or across clusters of computer units” [1]. +analysis.''[1]. This explains the four characters or four V’s of +Big data. Volume, Variety, Velocity and Veracity of data. + +Fig1. Big Data Validation Service +There ia s work which is carried out by an industry regarding +big data testing, They have used the Big Data services for III. METHODDOLOGY +each and every V’s. Here four types of testing’s are done first +is to test the velocity, when the data comes inside the system A. File Categorization using SVM Algorithm +or storage the rate of speed which it is extracting and loading The file classification is a function that automatically into target system. Second one is the volume testing which separates the set of file extension from the classification from tests the amount of data in which the map reduce algorithms the predefined set. The concept of file classification is a are used in specific to their business needs. 
Third one is the standardized number of predefined categories or fractions. variety of data where the type of data is important to File classification can be defined as a function of differentiate like structured or unstructured. If its unstructured automatically classifying electronic documents for their data then the data has to be processed and it has to be commenting classes based on their file extension. Each converted into a structured format to process it. Fourth one is document is not exactly one, multiple or category. Using veracity of data where the truthiness of data is going to be the machine learning, learning classifications of targets, and very important part as the validation and verification is automating those classifications automatically. This is a concern. Fig1. Shows the big data validation services and how learning problem overseeing. Due to the overlapping of it is going to be processed. categories, each category is considered a separate binary +classification problem. +Classification helps to identify the correct category of extension and store it on the server. In this process we must domain in use, in this section I decided to divide the cloud file use the SVM algorithm. SVM Algorithm Main concept into four categories related to a particular file, which is split classification +into an image file, video file, text file, and document file. For +extraction. Then get the extension and classify the file + +Fig 2: Overview of Big data testing +File size and File extension Testing +A. De-duplication in Preprocessing Testing File size and file extension is the one of the pre process In big data preprocessing technique, we've got to check the testing. Data has been collected from varied sources and when de-duplication, zero file size, then the file extension. 
In collection information the info the information set and de-duplication testing ,To transfer file the user and also the uploading the data into the big information system and before CSP perform each de-duplications. The de-duplication process it, to validate the file is empty or not. If the file size is operation is a twin of that within the baseline approach. zero the file is not uploaded into the cloud server. Then the additional exactly, the user sends the file tag to the CSP for File extension validation helps us in many ways to confine the the file duplicate check. If a file duplicate is found, the user extension of file. In the file extension validation, to test the can run the POW protocol POWF with the CSP to prove the file size limit. For example, the image file contains some limit, file possession. If no duplicate exists, CSP stores the cipher if the size is exceeds it is not uploaded into the cloud +rtext with key and returns the corresponding pointers back to +user for native storage. In de-duplication on the opposite hand B. Map Reduce in Post Processing +of keeping the multiple information copies with an equivalent Map reduce is that this programming paradigm that enables file content, de-duplication eliminates recurrent information for large scalability across a whole lot or thousands of servers by keeping solely single copy and referring alternative in a very big data cluster. The Map reduce is straightforward redundant information thereto single copy. The to grasp for those that area unit acquainted with clustered de-duplication to eliminates duplicate copies of an equivalent scale-out data processing solutions. +file. De-duplication also can be used at the block level, that +eliminates duplicate blocks of information that occur in non +identical files. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+Retrieval Number: A1912058119/19©BEIESP Published By: +DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org 2493 Sciences Publication +International Journal of Recent Technology and Engineering (IJRTE) +ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019 + Map-Reduce Validation represent the checking of key-value pairs generation and validate the map-reduce by applying numerous business rules. The term Map reduce truly refers to 2 separate and distinct tasks that big data programs perform. the primary is that the map job, that takes a group of knowledge and converts it into another set of knowledge, wherever individual components area unit countermined into rows (key/value pairs). The scale back job takes the output from a map as input and combines those information rows into a smaller set of rows. In map scale back, the scale back job is often performed once the map job. The Health Care big data area unit hold on within the server. Within the user will fetch information quickly we've to use the map scale back. +Table 1. Quality Attributes of Big Data +S.N +o Quality Variable Explanation 1 Data correctness The correctness of the data is validated with respect to format and data types. 2 Data consistency This validated the data consistency in various angles it also refers to data gathering from various locations. 3 Data accuracy This refers to closeness between the actual result and the expected result. Data from various sources are gathered and measured for its accuracy. 4 Data security Security is one if the important concern which need to be addressed and validated for the applications security and its integrity in various perspectives III. TEST PROCEDURE +In addition the quality factors which are discussed in this paper are as follows: +Reliability: +This assures the reliability of the big data applications under some specific conditions how the system is going to perform. 
When a specific load is given to the system how it behaves. Performance: How the big data applications performs in specific conditions and its also indicates about the performance of big data apps, such as availability and response time. +Correctness: +This speaks about the rightness of the big data applications. Scalability: +Scalability is the factor which speaks about the applications flexibility to scale. In some situations it should support to scale some huge data and huge repositories and storages from period to period. In the same way that the applications scalability should be tested for its purpose. +Security: +The validation of security regarding the big data application is done here at different stages. +IV. RESULT +A. Data Accuracy +Data Quality is one of the important factor which needs to be considered when we go for any testing the first one we need to discus is data accuracy. Data accuracy is the important factor of data quality. It is the data stored in that field is correct or not. In this implementation the medical data set of sample 100000 records are taken as the test data set. +In data accuracy is higher when compare to preprocessing. After the pretesting the each cluster provides the correct accurate result. Before preprocessing the data is stored in unstructured format after preprocessing the data is formed in to structured data and its formed into different clusters. Cluster type such as image, video, document and text. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Retrieval Number: A1912058119/19©BEIESP Published By: +DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication +International Journal of Recent Technology and Engineering (IJRTE) +ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019 +When the Quality challenges for Big data is being discussed the data quality of applications are also considered. 
The Quality variables of enormous information applications were secret nowadays. Traditional quality factors following robustness, performance and security can be valid in big data. Now coming to big data validations and the quality challenges this work discuss about the quality and validation process of big data. On comparing to customary software testing with the big data application testing process is entirely different and they are discussed in this paper in a brief manner. +The test procedure for big data is as follows. +1) Functional testing of big data, which includes rich test environments and domain-specific functions; +2) Non-function testing, includes performance, reliability, portability, Security, system consistency and Quality of Service +3) Big data Timing testing, checks timeliness of the system; Fig 3: Data Accuracy +4) Big Data feature testing, targets user related system +evolution and visualization +These four steps are followed in testing the big data +applications and feature testing which includes testing +continuously with real time testing. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Retrieval Number: A1912058119/19©BEIESP Published By: +DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication +International Journal of Recent Technology and Engineering (IJRTE) +ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019 +B. Data volume +In data volume, each cluster takes more storage space before pretesting. After that implementation of the pre testing the size of the data has been reduced. By means of de-duplication testing the duplicate data has been removed and the storage space has been reduced far better than before preprocessing. Because of the remove duplicate data, null value data and file categorization the storage space becomes low in each cluster. + +7. 
Quality Assurance for Big Data Applications – Issues, Challenges and Needs – Chuanqi Taq, Jearry Gao. 2016. +8. A Survey on Quality assurance techniques for big data applications, Pengcheng zhang, Xuewu Zhou, Jerry Gao, Chuanqi Tao. 2017. +9. Big Data - Testing Approach to Overcome Quality Challenges – Infosys White paper – Vol 11 no 1- 2013. +10. Big Data Testing Services, Infosys white paper – 2015 +AUTHORS PROFILE + Prof. S. Nachiyappan is working in VIT University Chennai campus, Completed his PG in Anna university in 2004 and his area of research is software engineering and Big Data. He is having 5 years of Industry Experience and 10 + Years of teaching experience. He is a member of ACM professional Chapter. +Dr. S. Justus Worked in various industries as project manager and researcher, he has an over all experience of 17+ years in both IT and Academic. He has guided more than 15 PG students for the project and has published various papers in national and international journals. He is a member of ISTE, IEEE, IAENG. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Retrieval Number: A1912058119/19©BEIESP Published By: +DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org Sciences Publication +International Journal of Recent Technology and Engineering (IJRTE) +ISSN: 2277-3878 (Online), Volume-8 Issue-2, July 2019 +Fig. 4: Data Volume +V. CONCLUSION +Big data information is as yet advancing and analyzers and testers have a huge duty to recognize new thoughts for performing tests in the field of Big Data. A standout amongst the most testing things for an testers is to keep the pace with industry's evolving elements. In many aspects of the test, technical details behind the tester scene are unknown, but testing of Big Data Technology is quite different. 
There is no need to be strong in a Tester Fundamentals test, but in order to analyze many performance barriers and other problems, you need to know the minute details in the design of database designs. Big data testers should first learn parts of the big data Eco System. In this paper 10000 sample data is used entered big data in the same cluster mode. We turn out with two preprocess and post process testing results. The future work in this is to test information with numerous group frameworks. + We have to give the more accurate result by using different algorithms. +REFERENCES +1. Avita Katal, Mohammad Wazid, R H Goudar, “Big Data: Issues, Challenges, Tools and Good Practices”, IEEE, 2013. +2. Xiaoming Gao, Judy Qiu, “Supporting Queries and Analyses of Large-Scale Social Media Data with Customizable and Scalable Indexing Techniques over NoSQL Databases”, 14th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, 2014. +3. Matthew Smith, Christian Szongott, Benjamin Henne, Gabriele von Voigt, “Big Data Privacy Issues in Public Social Media”, IEEE, 6th International Conference on Digital Ecosystems Technologies (DEST), 18-20 June 2012. +4. Vapnik (1995), The Nature of Statistical Learning Theory. Springer, Berlin +5. Burges, C.J.C. (1996). Simplified Support Vector Decision Rules. 13th International Conference on Machine Learning. +6. Pengcheng Zhang1, Xuewu Zhou1, Wenrui Li2, Jerry Gao3,4 (2017) A survey on quality assurance techniques for big data applications. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+Retrieval Number: A1912058119/19©BEIESP Published By: +DOI: 10.35940/ijrte.B1912.078219 Blue Eyes Intelligence Engineering & Journal Website: www.ijrte.org 2495 Sciences Publication diff --git a/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt b/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt new file mode 100644 index 0000000..e12b618 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/97-An Improvement of a Checkpoint-based Distributed Testing Technique on a Big Data Environment.txt @@ -0,0 +1,203 @@ +ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1081 +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +An Improvement of a Checkpoint-based Distributed Testing Technique +on a Big Data Environment +Bhuridech Sudsee, Chanwit Kaewkasi +School of Computer Engineering +Suranaree University of Technology, Nakhon Ratchasrima, Thailand, 30000 m5741861@g.sut.ac.th, chanwit@sut.ac.th + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Copyright $Ò 2018 GiRI (Global IT Research Institute) +ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1086 + +Abstract— The advancement of storage technologies and the fast-growing number of generated data have made the world moved into the Big Data era. In this past, we had many data mining tools but they are inadequate to process Data-Intensive Scalable Computing workloads. The Apache Spark framework is a popular tool designed for Big Data processing. It leverages in-memory processing techniques that make Spark up to 100 times faster than Hadoop. Testing this kind of Big Data program is time consuming. 
Unfortunately, developers lack a proper testing framework which could help assure the quality of their data-intensive processing programs while saving development time and storage usage.
+We propose Distributed Test Checkpointing (DTC) for Apache Spark. DTC applies unit testing to the Big Data software development life cycle and reduces the time spent on each testing loop with checkpoints. By using the checkpoint technique, DTC keeps the quality of Big Data processing software while keeping the testing cost inexpensive by overriding the original Spark mechanism, so that developers have no pain learning how to use DTC. Moreover, DTC has no additional abstraction layers. Developers can upgrade to a new version of Spark seamlessly. From the experimental results, we found that in the subsequent rounds of unit testing, DTC dramatically speeds up the testing time, being up to 450-500% faster. In case of storage, DTC can cut unnecessary data off and make the storage usage 19.7 times smaller than the original checkpoint of Spark. DTC can be used either in case of JVM termination or testing with random values.
+Keyword— Distributed Checkpointing; Apache Spark; Big Data Testing; Software Testing;
+I. INTRODUCTION
+The sensors, IoT devices and the diversity of fast-growing electronic devices, along with the increasing number of Internet users, have been generating tremendous amount of data recently. [NOTE(review): the original sentence was garbled by two-column PDF extraction; reconstruction to be confirmed against the published paper.] They are not only the large amount of data
+——————————————————————— Manuscript received December 27th, 2017. This work was supported by Suranaree University of Technology, and a follow-up of the invited journal to the accepted & presented paper of the 20th International Conference on Advanced Communication Technology (ICACT2018),
+Bhuridech Sudsee is with School of Computer Engineering, Suranaree University of Technology, Nakhon Ratchasrima, Thailand (corresponding author phone: +66-44-22-4422; e-mail: m5741861@g.sut.ac.th).
+Chanwit Kaewkasi is with School of Computer Engineering, Suranaree University of Technology, Nakhon Ratchasrima, Thailand (e-mail: chanwit@sut.ac.th). +but their structures are also complex as well. This complexity makes the traditional data mining tools inadequate to manage today’s data [1]. +The MapReduce [2] programming model has induced the development of many frameworks such as Apache Hadoop [4], Map-reduce-merge [5] and Apache Spark [6], which aim to process data intensive tasks. Developers only need to rewrite their programming logic in the form of map and reduce functions in order to process data on a MapReduce framework. These functions will be automatically managed by the framework’s default configuration. This mechanism makes the MapReduce framework easy to use. At its simplest form, a MapReduce program usually starts by a map function creating key/value pairs from the input. These intermediate key/value pairs are then passed to a reduce function to produce the final results. The MapReduce model is parallel by nature. It is designed to allow developers to run MapReduce programs for high performance computing jobs using a commodity cluster, built from low-cost hardwares. With this kind of the cluster architecture, we can handle massive amount of data and process them on numerous cluster nodes without a single point of failure [3]. +Although the MapReduce model is easy to use for software development, but it is quite tricky to test software written by the MapReduce model. Software testing is a vital part of the development process. Testing is usually 25-50% of the overall cost [8]. We found that the current mechanism is not enough to assure quality for Big Data processing programs. Unit testing is a software testing technique which properly leads to better levels of quality. However, tools like Scalatest[9] or jUnit[10] have their own limitations to use with a MapReduce framework like Spark. 
For example, SparkContext and SparkSession objects must be instantiated only once for each running Java Virtual Machine (JVM) to avoid unexpected testing results [12]. Spark-testing-base [11] also does not have a testing mechanism for Spark. Without modification, it cannot work on a Spark cluster because of its inability to distribute class files across worker nodes. The aforementioned techniques are not suitable for Spark simply because they are not designed to test programs that distributedly process large amounts of data.
+Test-driven development (TDD) is a software development technique that helps developers to focus on
+writing a specific test at a time. It additionally allows code improvement while preserving correctness according to the specification. The TDD workflow consists of the following steps: (1) writing a minimum test, (2) writing code to just make the test pass, and (3) refactoring to remove unnecessary code while still making the current test pass [13]. We call these steps a TDD workflow herein. Applying TDD to data intensive programs is difficult due to the nature of workloads, which need to be processed on a cluster. So, developers require a special tool to help shorten each loop of the TDD workflow.
+Spark has cache, persist and checkpoint methods to help mitigate job failure. These mechanisms, however, do not help the software testing process much. The main reason is that a cluster state cached or persisted by them does not survive across runs of JVMs. A cluster state saved by the checkpoint method does survive on disk but unfortunately it cannot be retrieved back by a newly started JVM [14, 15].
+In this paper, we present Distributed Test Checkpointing (DTC), a technique that leverages the checkpoint technique to enhance software testing for data intensive jobs. With DTC, developers can increase productivity when testing their software on a distributed cluster repeatedly.
DTC applied a hash function on each data partition of a Resilient Distributed Datasets (RDD) [18] to use an identifier. Modification of an RDD or a Dataset can be traced by the hashed number. The testcase that uses the RDD is also hashed at the bytecode level. Combining these techniques, DTC is found to reduce testing time and storage required by checkpointing significantly compared to the original Spark’s checkpointing technique. +The remaining of this paper is organized as followed. Section II discusses related works, including Apache Spark. Section III presents the design and internal mechanism of DTC. Section IV presents the system architecture of the cluster used by our experiments, and the experimental results. This paper then ends with conclusion and future works in Section V. +II. BACKGROUND AND RELATED WORK +A. Apache Spark +Spark is a data intensive processing framework focusing on in-memory data processing [6], which is implemented in the form of Resilient Distributed Dataset (RDD) [18]. RDD is designed to take care of the data flow and handle the processing mechanism. An RDD could be created using one of the following methods (1) reading data from file (2) parallelizing collection in the driver program (3) transforming from another RDD (4) and by transforming back from a persisted RDD [6]. An RDD comprises with two kinds of command, transformations and actions. A transformation command transforms an RDD to another RDD. These commands are map, filter and groupByKey, for example. Another set of commands are actions, which are collect and count, for example. An RDD keeps all previous transformation inside itself. This direct acyclic graph of transformation is known as lineage. The beginning of the real computation occurs only when an action is called. This is the lazy evaluation nature of Spark. +A mechanism for failure recovery that helps an RDD to resume the processing without re-computation from scratch are methods such as cache, persist and checkpoint. 
The cache method uses persistency at MEMORY_ONLY, while the persist method has several levels of persistency. The checkpoint method, in contrast, uses the technique which save data onto a reliable storage, such as HDFS, Amazon S3 or Ceph. An RDD is usually cached or persisted during its computation to avoid re-computation previous steps [15]. +The checkpoint technique is also applicable for Spark Streaming because it truncates the internal lineage, so the RDD does not need to knowledge of its parent. However, this mechanism is not designed for software testing. The re-computation is still required to start from the beginning when the testcase is re-run. The rerunning of the testcase destroys a Block Manager inside an Executor. This Block Manage is responsible for keeping cached and persisted data. The new Driver program and the testcase therefore is not able to access the location of checkpoints. +In addition, Spark has introduced the Dataframe API in 1.3 and Dataset in 1.6. Both abstractions can be used interchangeably because Dataset[Row] is the type safer version of DataFrame. A dataset is also convertible to an RDD. In the case of DTC proposed in this paper, we read and write data directly without triggering any computation of related RDDs. +B. Debugging framework for Spark +A technique used to improve quality of the software is debugging. Developers usually debug to observe certain set of variables they are interested. However, in the Data-intensive Scalable Computing (DISC), the debugging process is difficult as data are computed distributedly on a cluster. +BigDebug [7] is a tool designed to helps Spark’s developers deal with debugging a Big Data program. There is a downside that the tool requires user’s interaction during the debugging process. Those interactions make the debugging more difficult than those of normal programs because the Big Data programs are distributed by nature. 
Moreover, a BigDebug program cannot tackle the problem when the RDD being debugged requires changes. The whole debugging process needs to start over in that case. In case of the developer changing code on-the-fly, the RDD will become inconsistent as some partitions of the RDD have been processed by the old version of the code, while other partitions will be processed by the new code. BigDebug supports Spark up to 1.2.1 at the time of writing.
+C. Checkpoint implementation for Spark
+Researchers have employed the checkpoint of Spark in many ways to improve its efficiency, as follows.
+Flint [26] was created atop the original checkpoint technique of Spark. It aims at applying checkpoints and storing their data on transient instances to reduce the VM usage cost. A transient instance is a kind of low-cost computing unit, which can be recalled anytime by its cloud provider. Flint solves this problem by writing an RDD’s partitions to an HDFS, which is operated on on-demand instances. We found that this implementation lacks a mechanism to prevent re-calculation when the JVM is terminated. In addition,
+their checkpoints will be saved automatically, so developers need to prepare a huge amount of space in order to prevent the storage from becoming full, which can lead to the failure of the whole system.
+TR-Spark [27] implements a similar approach to Flint. The difference is that TR-Spark allows fine-granularity checkpoints at task level. By leveraging this level of checkpoints, the storage usage could be reduced in comparison to checkpointing the whole RDD. However, TR-Spark is difficult to use as developers need to collect the information of VM failures to let it know the failure probability. TR-Spark does not deal with changes of the Driver program.
+Automatic Spark Checkpointing (ASC) [25] was designed to help analyze the trade-off between RDD checkpointing and its restore. ASC performs this computation by estimating them from an RDD lineage.
Nevertheless, this technique does not support checkpoints across JVM termination. It also lacks the ability to recognize the similarity or identity of an RDD.
+Spark-flow [24] aims to mitigate the effect of JVM termination on checkpoint restoration. It makes use of Distributed Collection (DC), a library similar to the Dataset API. DC is able to analyze an RDD at the bytecode level with ASM. It can identify the location of checkpoint calls, inside an anonymous function. It also uses the MD5 hash function to help detect changes at the bytecode level. However, DC has some downsides, as follows. First, when calling checkpoint on a DC, the data is re-read again after checkpointing. Second, when restoring from a checkpoint, the action count will be triggered, so the re-computation kicks in. Finally, computation is mainly done on the Driver machine, so the mechanism is actually not distributed. This often causes an Out-of-Memory exception inside the Driver program and it stops working.
+1 val data = sc.parallelize(Array(1,2,3,4,5)) 2 val distData = data.map(x => (x,1))
+3 distData.dtCheckpoint()
+4 distData.count()
+5 distData.collect()
+Fig. 1. Example of a dtCheckpoint call on an RDD
+
+Fig. 2. The dtCheckpointing mechanism inside DTC
+III. DESIGN AND IMPLEMENTATION
+Spark stores the RDD transformations in the form of a lineage graph, a.k.a. the logical execution plan. When an action is triggered for a certain RDD, its job will be submitted to the DAG Scheduler to transform the RDD’s lineage into a directed acyclic graph, in which a vertex is an
+RDD partition and an edge is a transformation. After that the staging process will kick in. This staging process will be started from the final action going backwards to the beginning of the RDD. However, in the real execution, the process will be performed from the beginning of the RDD forward to the final action. After the staging, the system obtains a set of Stages and Tasks.

+A checkpoint of an RDD, however, must be done before the first action is performed. From the source code in Fig. 1, when a program starts to process an array of integers 1 to 5, the array will be passed as a parameter of method parallelize of class SparkContext. This results in a ParallelCollectionRDD stored in variable data. At line 2, each element from the data RDD is mapped with 1 using the map method as a key/value pair. The result is a MapPartitionsRDD stored in variable distData. At line 3, method dtCheckpoint is invoked. Please note that the original Spark and DTC both use the lazy evaluation mechanism; this means that the checkpoint method only marks a certain point over the DAG, where checkpoints will happen. At line 4, command distData.count() is the first action. When this first action is triggered, the checkpoint is not yet created. The computation then is started from the beginning of the RDD to the marked point. After that, the checkpoint is stored at the first upper directory level as a hash value generated by the mechanism of DTC. At line 5, method distData.collect() is invoked as the second action. The system will then check backwards from the action to the beginning of the RDD. This time the system will find a checkpoint already exists because there is a directory whose name matches the hash. When the DAG Scheduler starts to transform the lineage, it uses the data directly from the checkpoint without re-computation. Please also note that actions count() and collect() belong to different jobs. The result computed by count() will not be included as an input for collect(), despite their order of execution.
+Scala allows us to implement a new feature for a class by creating an Implicit Class and then mixing it into existing classes, like RDD or Dataset. The DTC mechanisms proposed in this paper are implemented using that technique.
With DTC as an Implicit Class, developers can still use all existing properties and behaviors of an RDD, while having an additional method from DTC. Developers are also able to upgrade the Spark framework to newer versions without rewriting this mechanism. DTC is more suitable for testing than Spark-flow, which has many abstraction layers. These abstractions make it difficult to enhance the capability of Spark-flow.
+A. DtCheckpointing
+This mechanism works when the method dtCheckpoint of an RDD or a DataSet is called. This call marks an RDD and also starts the Hashing RDD mechanism to obtain a directory path from the hash transformation. If there is no directory matching the hash value, it means that the system never created that checkpoint. After the creation of the directory, the content of the RDD will be stored inside it. But if the directory exists, the system will read the content as the data of the RDD. In Fig. 2, an RDD is created using the parallelize method and is transformed with map, followed by an invocation of dtCheckpoint. The sub-system
+DtCheckpointing kicks in to mark points in the RDD for later storing when action count is called.
+We usually perform the tests on a Spark cluster with SBT, which is an interactive build tool that helps develop software with Java or Scala. SBT allows us to write a build file using a Scala-based Domain Specific Language. It manages program dependencies with Apache Ivy. With DTC, we modify test commands of SBT, namely test, test-only, and test-quick, to support not only local execution but also execution on the real working cluster. We solve the problem of ClassNotFoundException and NoClassDefFoundError by making a fat jar via a custom SBT task. So, we introduce testOnCluster for testing every testcase, testOnlyOnCluster to test a specific testcase, and testQuickOnCluster to test a certain testcase which may have failed last time, was never tested, or needs re-computation.
Our modification to SBT allows the new mode of testing on the real cluster.
+B. Hashing an RDD
+A hash function is a one-way function which can be used to check data modification. Even if one bit of data is changed, this function notices that modification. In this paper, we will compare MD5, SHA-1 and SHA-256 because these algorithms have various hashing speeds and resource usage.
+This technique of the DTC framework is able to track the change of an RDD because of the generated transformations. So we can use this mechanism to detect modification of any transformation back to the original RDD. When an action is triggered, the DTC framework detects all RDD dependencies and prepares a clean bytecode made available by the CleanF property of the RDD, followed by preparing other Java bytecode files which relate to the dependencies. In the preparation stage, DTC uses ASM, a tool to manage Java bytecode [17], which Scala internally uses for its compilation mechanism. With ASM, DTC’s mechanism of hashing an RDD can access Java class files at runtime and de-serialize them for reverse engineering purposes. DTC needs to remove some brittle information such as LINENUMBER or serialVersionUID from a class file. With this information filtered out, we can detect changes of an RDD or DataSet even when the line numbers have been changed.
+After the class file analysis in the preparation stage, and after unnecessary dependencies have been eliminated, hash numbers will be computed for these dependencies and for the input data; the origin of an RDD will also have its hash number computed. The computation is distributed with Spark’s accumulator; the first-level hash number computation will
+SET hash_array = empty array of string
+IF (HASH_INPUT_DATA = true) THEN
+ READ each data partition from (RDD or DataSet) COMPUTE hash of each data partition
+ APPEND hashes to hash_array
+ENDIF
+Fig. 3.
Pseudo codes of the mechanism of Hashing an RDD +compute hash number of input data for every partition, and then collect and reorder result because unpredictable computation time. After that, the DTC will compute hash number of sorted hash number again. Fig. 3, illustrates the steps of hashing mechanism please note that the computation of input data is an option that can specify with dtCheckpoint(true). +IV. EXPERIMENTS +A. Cluster configuration +The experiments presented in this paper have been conducted on a Spark cluster consisted of 10 nodes. Each node is an Intel Core i5-4570 Quad-core with 4 GB of RAM. The drive node is an Intel Xeon E5-2650V3 Deca-core with 8GB of RAM. We use Apache Spark 2.0 for the experiments along with Ceph as the distributed file system over these 10 nodes. The Ceph storage is 10 TB. The system architecture is illustrated in Fig. 4. +TABLE I +COMPUTATION PROGRAMS AND INPUT DATA OF EXPERIMENTAL Program Input dataset +Wordcount 31 GB of Wikipedia +Triangle Counting 875,713 vertices and 5,105,039 edges PageRank 875,713 vertices and 5,105,039 edges Pi Estimation 109 times + +Fig. 4. The cluster architecture used by the experiments +B. Methodology +For the experiments, we use a MapReduce program Wordcount on 31 GB data dump of Wikipedia, Triangle Counting with Google Web Graph [28], PageRank with Google Web Graph and the last one is Pi Estimation with one billion times. Each program with its input dataset is shown in Table I. The Wordcount Program splits sentences into array of words and counts them using both RDD and Dataset (or DC in case of Spark-flow) with different checkpoint mechanisms. We tested each checkpoint mechanism 10 times continuously and measured both in space and time perspectives. Moreover, we tested 5 additional with JVM termination. Then we started the JVM again to test the recovery process of checkpoints. + Table II shows the comparison of checkpoint mechanism properties. 
If we do not use checkpoint, the system does not have the fault tolerance property. If we use the original Spark, it is not suitable for testing because its checkpoint mechanism does not work well in the test environment. In case of Spark-flow it does not work on the cluster environment out-of-the-box. DTC, on the other hand, is designed to address these problems in the testing + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Copyright $Ò 2018 GiRI (Global IT Research Institute) +ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1088 +TABLE II +FEATURE COMPARISON BETWEEN CONFIGURATIONS +Failure More abstraction Prevent re-calculation Suitable for +Method Cluster +tolerance layer from beginning Testing +No-Checkpoint No No No No Yes Spark Original Yes No Yes Not Suitable Yes Spark-flow Yes Yes Yes Yes No DTC Yes No Yes Yes Yes +TABLE III +THE COMBINATION OF ALL EXPERIMENTAL CONFIGURATIONS +Type Checkpoint Data Format Hash Algorithm Configuration RDD DataSet DC Java Kryo Avro Parquet MD5 SHA1 SHA256 +No-checkpoint √ √ - - - - - - - - Spark Original √ √ - √ - - - - - - Spark-flow - - √ - - - √ √ - - DTC √ √ - √ √ √ √ √ √ √ +environment. So, DTC provides the better environment to that we can multiply by 4 to roughly results Pi number. We support unit testing. tested 5 cases then stop the JVM, after that we re-run these +Table II shows a brief differentiation of comparison 5 cases again on RDD. +method that we will experiment. That meant, if we have no +C. Experimental results (consecutively 10 cases) +checkpoint it will lack failure tolerance, the Spark original +checkpoint insufficient to testing. The Spark-flow push From the experiments, we start discussing in the case of developer in more abstraction layer by create a higher level no hashing input data, denoted not-hashinput by running of a DataSet and it not work on cluster naturally. In Table consecutively 10 cases. 
In this case the input will not be III, we show the combination of all experimental verified by hashing functions before the program starts. We configurations. Accordingly, the DTC introduce to rectify assume that development and during the tests. The that plain. experimental results are show in Fig. 5. At the first run, +We compared with MapReduce Wordcount algorithms DTC and the original-checkpoint mechanism are +on Wikipedia 31 GB with separating each word from each all slow with insignificant difference. The other with white space. And then, we filtered only word DTC-Java-SHA1 is slowest. It uses 636 seconds slightly +occurred more than 10 million times, after that asserted TABLE IV +with the most word occurred. We consecutively repeated CHECKPOINT’S STORAGE USAGE OF AN RDD +these steps 10 cases and performed testing on 5 cases then Storage usage Size Unit stopped the JVM. After that we re-run these 5 cases again No-checkpoint 0 MB +on both RDD and DataSet. Spark original checkpoint 9.870 MB +Next, we compared with Triangle Counting Program DTC-Java-with-hash 0.987 MB +which gathers the number of vertices whose has two DTC-Java-without-hash 0.987 MB adjacent vertices with an edge between them. And then DTC-Kryo-with-hash 0.501 MB perform PageRank Program to ranks members onto the DTC-Kryo-without-hash 0.501 MB +graph. Input of these programs came from Google Web +Graph. with 875,713 vertices and 5,105,039 edges, testing TABLE V +on 5 cases then stop the JVM, after that re-run these 5 cases CHECKPOINT’S STORAGE USAGE OF DATASET +again on RDD. Storage usage Size Unit Finally, we compared the Pi Estimation program by using No-checkpoint 0 MB Monte Carlo algorithm shows in (1) [29]. 
Spark original checkpoint 9.860 MB DTC-Avro-with-hash 0.987 MB +DTC-Avro-without-hash 0.987 MB 2%/3&4*ℎ/ 5,)* -)%-./ DTC-Parquet-with-hash 0.993 MB +ℙ($%&'()*ℎ), -)%-./) = 2%/3&4*ℎ/ 6753%/ DTC-Parquet-without-hash 0.993 MB Spark-flow 9.930 MB +∬{)*+,*-.}1 %&%' += +∬{0.-),,-.}1%&%' different from original-checkpoint. The π (1) no-checkpoint configuration does not have this startup += 4 overhead, so it run at 136 seconds on average. For the first +The algorithm randomly generated two values which run, All DTC and the original-checkpoint are 4.7 represent to coordinate x and y of unit circle (so both x and times or slower than the no-checkpoint mechanism. y are between -1 to 1). After that, trying to addition However, all DTC configurations are significantly faster in between square magnitude of x and square magnitude of y the subsequence runs. +and if that result less than or equal to 1 will be count as fall Fig. 6 shows the comparison between cases of applying in the unit circle. That number will use to represent π/4, so hash functions over input data to allow the system to detect + +Fig. 5. Comparison of checkpoint time of RDDs without hashing inputs using the Fig. 6. Comparison of checkpoint time of RDDs with hashing inputs using the +Wordcount program. (10 cases consecutively) Wordcount program. (10 cases consecutively) + +Fig. 7. Comparison of checkpoint time of DataSet,including Spark-flow without Fig. 8. Comparison of checkpoint time of DataSet,including Spark-flow with +hashing inputs using the Wordcount program (10 cases consecutively). hashing inputs using the Wordcount program (10 cases consecutively). + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Copyright $Ò 2018 GiRI (Global IT Research Institute) +ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 +changes of the input. 
It shows that DTC mechanisms are slower than no-checkpoint and original-checkpoint only in the first run. In the subsequent runs, DTC mechanisms make the tests faster than those run by no-checkpoint and original-checkpoint. We found that DTC-Kryo-SHA1 is slowest in the first run. It uses 908 seconds on average, while no-checkpoint uses 136 seconds and original-checkpoint uses 636 seconds.
+In the subsequent runs, the DTC mechanism uses around 85 seconds on average. It is significantly faster than both no-checkpoint and original-checkpoint, which
+is 60%
+In the first run with hash input, the fastest DTC mechanism is DTC-Java-SHA256; it is 480% slower than no-checkpoint and 24% slower than original-checkpoint. In the subsequent runs, this mechanism is 40% faster than no-checkpoint and 590% faster than original-checkpoint. Other cases
+follow similar trends.
+In the case of DataSet, we found similar trends as in the case of RDD. During the first run DTC mechanisms are slowest, and significantly faster in subsequent runs. Fig. 7 and Fig. 8 show the comparison between checkpoint mechanisms for the DataSet without hashing input and with hashing input, respectively. We also include Spark-flow
+in these experiments. We found that Spark-flow uses 752 seconds at the first run, while DTC-Parquet-MD5
+uses 606 seconds, so DTC is 24% faster than Spark-flow. In the case of hash input data, DTC is 40% slower than Spark-flow for the first run. However, in the subsequent runs, DTC dramatically reduces time spent, according to the aforementioned trends.
+The mechanism of checkpoint usually requires use of storage. The storage usage comparison is then presented in Table IV. According to the table, DTC with the Java serializer uses only one-tenth of the storage used by the original Spark checkpoint. In the case of DTC with Kryo, it uses only 5% of the storage of the original-checkpoint.
+These storage usages are similar for DataSet.
According to Table V, DTC with the Avro format uses only 10% of the original storage. In the case of DTC with the Parquet format, it uses only 11% of the original storage. Comparing these results with Spark-flow, we are roughly at the same ratio.
+DTC is designed to allow re-usability of RDDs and DataSets. It can traverse and detect changes of the dependency of each RDD or DataSet. From the experiments, we have found that DTC has a larger overhead than the mechanism of the Original Spark only when the testcases are in their first run. When the testcases are in later runs, DTC makes them 5-6 times faster than running by the Original Spark and Spark-flow. Moreover, DTC uses
+disk space 8-9 times less than both implementations, as shown in Table IV and Table V.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright $Ò 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1089
+(a) (b)
+Fig. 9. Comparison of checkpoint time of RDDs using the Wordcount program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+(a) (b)
+Fig. 10. Comparison of checkpoint time of DataSet using the Wordcount program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright $Ò 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+D. Experimental results (5 cases with JVM termination)
+In this section, we discuss the experimental results in the case of running 5 cases consecutively, then stopping the JVM, after which the experimental cases were re-run again. The behavior on different frameworks was observed.
+Firstly, we discuss the result of the Wordcount program on RDD.
We found that DTC-Java-SHA256 used 542 seconds in the first run before stopping the JVM, so DTC is 9% faster than original-checkpoint, which uses 596 seconds. After stopping the JVM or closing the program and then re-running the test cases, DTC with all settings used only a few seconds to recover checkpoints, while other frameworks used hundreds of seconds, as shown in Fig 9. In Fig 9, the dashed line is the first run before JVM termination and the solid line is the second run after restarting the JVM.
+In the case of DataSet shown in Fig 10, the dashed line presents the first run of 5 cases. We found that the original-checkpoint used 654 seconds, while Spark-flow used 585 seconds. So, Spark-flow is 11%
+faster than the original one. But DTC with the DTC-Parquet-MD5 configuration used 595 seconds, 9% faster than original-checkpoint. However, in
+the second run of 5 cases after restarting the JVM, as the solid line, the results show that the original-checkpoint used 697 seconds and Spark-flow used 545 seconds, while DTC with any configuration used just a few seconds.
+Fig. 11 shows the results comparing frameworks using the Triangle Counting program. In the case of not applying hashing to the input data, we showed that in Fig 11 (a), no-checkpoint, original-checkpoint and
+DTC used almost the same amount of time for the first runs.
Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright $Ò 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1090
+For the second runs after restarting the JVM, we found the same trend as we were discussing earlier. DTC with all configurations could reduce time for testing to just a few seconds. Because the inputs were in the form of a graph (vertices and edges) as shown in Fig 11 (b), the underlying mechanism of the Spark Framework tries to perform operations efficiently by casting the partitions of the input to class ShippableVertexPartition. In the research work reported in this paper, DTC does not yet support reading this kind of data type. Fig 11 (b) shows that DTC with all configurations could not help reduce time much. All frameworks use the same amount of time processing the data.
+Fig 12 shows the experimental results obtained from running the PageRank program. PageRank is a program that
+processes graphs. It used the same set of inputs as the previous experiment, Triangle Counting. In Fig 12 (a), it shows the results in the case of not applying hashing to the input data. We found that in the first testcase of the first run, the results of DTC with Java serialization, with either MD5 or SHA1 as the hash function, used 204 seconds, while the original-checkpoint used 214 seconds. In
+this comparison, DTC could speed up by 4%. For the rest of the testcases, the time spent by DTC is cut down to just a few seconds. In Fig 12 (b), we also found the same problem as with the Triangle Counting program. This was the result of hashing input.
+Finally, we discuss the results of the Pi Estimation program. In Fig. 13, we showed the trend when comparing frameworks. For the first testcase of the first run, we found
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.

+Copyright $Ò 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018
+
+(a) (b)
+Fig. 13. Comparison of checkpoint time of RDDs using Pi Estimation Program (5 cases with JVM termination) while (a) without hashing inputs and (b) with hashing inputs.
+
+Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd.
+Copyright $Ò 2018 GiRI (Global IT Research Institute)
+ICACT Transactions on Advanced Communications Technology (TACT) Vol. 7, Issue 1, January 2018 1091
+that without hashing inputs, the DTC-Kryo-SHA256 used 114 seconds, while the original-checkpoint used
+135 seconds, as shown in Fig 13 (a); DTC was 18% faster in this case. In the subsequent testcases, DTC could cut the running time significantly.
+In the case of hashing inputs, we found the same trend as shown in Fig 13 (b) as the previous results. DTC used processing time almost the same as original-checkpoint at the first testcase, then dramatically sped up by using only a few seconds for testing each testcase. Moreover, the DTC framework can detect changes in the case of random values, so that Spark developers can reproduce the input which causes software issues.
+V. CONCLUSIONS AND FUTURE WORK
+The experimental results have obviously shown that DTC is suitable for improving productivity for unit testing in Big Data applications in terms of time consumption and storage usage. We can perform testing for Big Data either on a local machine or a cluster. DTC could trace changes in testcases with random values. Unfortunately, we found that DTC could not work well in the case of graph algorithms such as Triangle Counting or PageRank because the Spark framework casts partitions of an input to ShippableVertexPartition. Therefore, one limitation of DTC is the input datatype. We are researching potential mechanisms which can be used for increasing the speed of testing and reducing storage usage, such as cache and persist.
The JVM configurations are ones of tuning parameter we are focusing. These subjects are being studied. +REFERENCES +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +Copyright $Ò 2018 GiRI (Global IT Research Institute) diff --git a/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt b/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt new file mode 100644 index 0000000..b5717a0 --- /dev/null +++ b/docs_to_import/rsl_oliveira2024/99-Quality Control Framework of Big Data for Early Warning of Agricultural Meteorological Disasters.txt @@ -0,0 +1,174 @@ +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +Created with an evaluation copy of Aspose.Words. To remove all limitations, you can use Free Temporary License https://products.aspose.com/words/temporary-license/ +Quality Control Framework of Big Data for Early Warning of +Agricultural Meteorological Disasters + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +Jiale Li +College of Ecology and Environment, Institute of Disaster Prevention +Sanhe, Hebei, China +lijiale_cumtb@126.com +ABSTRACT +Agricultural meteorological disasters, including floods, droughts, dry hot winds, low temperature chills, typhoons, hail and continuous rain, can lead to significant reduction in agricultural output. Big data platform for early warning of agricultural meteorological disaster is the basis of business operation system for early warning of agricultural meteorological disasters, and the data quality is an important guarantee for success of the early warning. 
Quality control of big data for early warning of agricultural meteorological disaster involves names of data sets, metadata, data documents and content of data sets. The quality control for contents of data sets is divided into quality control of attribute data and that of spatial data, and quality control of spatial data is divided into quality control of vector data and that of raster data. Methods for data quality control are divided into fully automatic, semi-automatic and full manual control methods. +CCS CONCEPTS +• Social and professional topics ~ Quality assurance • Hardware ~ Printed circuit boards • Computing methodologies ~ Machine learning +KEYWORDS +agro-meteorological disasters, early warning, big data, quality control, framework. +1 Introduction +Meteorological disasters are atmospheric natural disasters that cause harm to human life and property, cause losses to social and economic development, and have serious adverse effects on human production and life [1]. According to statistics from the United Nations World Meteorological Organization, meteorological disasters account for 60% of all natural disasters [2]. China is a country with frequent natural disasters, and food +Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for components of this work owned by others than ACM must be honored. Abstracting with credit is permitted. To copy otherwise, or republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from Permissions@acm.org. +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China © 2019 Association for Computing Machinery. 
ACM ISBN 978-1-4503-7150-6/19/07…$15.00 https://doi.org/10.1145/3349341.3349371 +Shunbao Liao† +College of Ecology and Environment, Institute of Disaster Prevention +Sanhe, Hebei, China +liaoshunbao@cidp.edu.cn +production is greatly affected by natural disasters. About 70% of natural disasters are resulted from meteorological disasters [3]. +Agro-meteorological disasters are a general term for adverse weather or climatic conditions that occur in agricultural production processes and result in significant reduction in agricultural production, including floods, droughts, dry hot winds, low temperature chills, typhoons, hail and continuous rain [4]. Agro-meteorological disaster prevention needs to know a lot of information such as weather forecast, weather conditions, the scope of meteorological disasters, duration, intensity of disasters, population distribution of affected areas, number of large livestock, crop planting area, water irrigation status, etc. This information includes both spatial geographic information and a large number of weather attribute information inseparable from space [5]. Therefore, it is an effective method to combine high-tech such as remote sensing and GIS and conventional disaster monitoring and evaluation methods to monitor and evaluate major agrometeorological disasters [6]. Real-time quality control of meteorological data is of great significance for meteorological support of aviation activities and disaster prevention and mitigation [7]. +Data Quality Management is to improve data quality by refining and enhancing the management level of the organization. The management of data consists of a series of activities, which involve identification, measurement, monitoring, and early warning of data quality problems. These problems could be triggered off in one of the phases, which range from data planning, collection, storage, sharing, maintenance, and application to data destruction. 
Data quality assessment and management are generally measured in several dimensions, including completeness, conformity, consistency, accuracy, uniqueness, and integration [8]. +2 Big Data Platform for Early Warning of +Agricultural Meteorological Disasters +2.1 Platform Structure +Big data platform for early warning of agricultural meteorological disasters and model system are the basis of early warning service operation system (as shown in Figure 1). Users call data from Big data platform and early warning models through the interface of early warning service system for agricultural meteorological disasters to realize the early warning of agricultural meteorological disasters. At the same time, the + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +business system stores the user's early warning results into Big data platform for other users to query. +User1 User2 …… User n +Operation system for agricultural meteorological disasters warning service (Interface) +Big data platform for Models system for agricultural meteorological +disasters warning disasters warning +Basic data for agricultural +meteorological disasters Model build/selection +warning +Figure 1: Operation business system for early warning service of agricultural meteorological disasters +The quality control of big data for early warning of agrometeorological disasters refers to data quality inspection and data correction that arise in the process from basic data to Big data platform for agrometeorological disasters warning. However, the data quality issues that occur in the process from user operation results to Big data platform for agrometeorological disasters warning will not be discussed in this paper. 
+2.2 Quality Control Objects +Big data are divided into structured data and unstructured data, and the quality control of early warning big data for agricultural meteorological disasters is mainly for structured data. The large database of agricultural meteorological disaster warning consists of attribute database and spatial database. The attribute database includes real-time observation database (such as meteorological observation database) and non-real-time observation database (such as statistical survey database, historical climate database, etc.). The spatial database includes spatial vector database and spatial raster database. It was stipulated in this study that the object of quality control for big data of agricultural meteorological disasters warning was a data set, which was, a two-dimensional table in relational database, coverage in vector database or a grid layer in raster database. +Quality control objects in Big data platform for early warning of agricultural meteorological disasters are listed in Table 1. +Table 1. Quality control objects in the big data platform + +Data types at level 1 Data types at level 2 Quality control objects Examples Attribute data Real-time observed data Tables in relational database real-time observed meteorological data Non-real- time observed data Tables in relational database statistical survey data, historical climate data Spatial data Vector data Vector layers Land use, boundary Raster data Raster layers DEM, NDVI 3 Contents of Quality Control +According to data management strategy and actual situation of data, quality control of big data for agricultural meteorological disaster early warning was carried out at different levels, including quality control of data set names, metadata, data documents, and content of data sets. 
The quality control of content of data sets was divided into quality control of attribute data and that of spatial data, and quality control of spatial data was divided into quality control of vector data and that of raster data. +3.1 Quality Control of Data Set Names +Big data for agro-meteorological disaster warning are spatiotemporal data. The purpose of normalization of data set name is to let users know the spatiotemporal range, detail level and thematic content of data set by names of data sets, that is, the basic information about a dataset can be obtained by its name. +Therefore, dataset names of big data for agrometeorological disaster warning should contain four elements, which are spatial scope (region), time range, detailed level and thematic content of data sets, but however the order of these elements can be adjusted according to the habit. The time range refers to the time of data acquisition, not the time when the data is published or released. The detail level of data may be scale of vector data, spatial resolution of raster data, or administrative division unit of statistical survey data. For the normalization of data set name, the example is as follows: +Example: National 1:100,000 land use data (2015). Where "national" is the spatial range of data; "1:100,000" refers to the detail level of data; "Land use" is the thematic content; "2015" represents the time of the data. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +75 +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +3.2 Metadata and Data Documents +Metadata is data about data. It is information that describes a dataset. Metadata generally describes data sets by standardized entries, which are normative and uniform. Metadata can help users understand and apply data sets. Without metadata, users sometimes cannot fully interpret data. Therefore, metadata + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. 
+ +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +conforming to norms and with sufficient information is an important means of data quality assurance. +A data document is a file that describes a data set. Compared with metadata, data documents do not follow a strict coding specification, but they are sometimes critical to the user's understanding of data. For example, in some data sets, attribute elements are represented by codes consisting of letters and numbers, the description of the codes (including meaning, unit, etc.) is particularly important. Both metadata and data document are important means of data quality control, but they have their own characteristics. Metadata is more standardized, but the description of datasets by metadata is sometimes not specific. Data documents are not as standardized as metadata, but their description may be more specific. Therefore, metadata is relatively suitable for the standardized management of data sets, and data documents are more suitable for the interpretation and application of data sets by users. From the perspective of data quality control, either metadata or data documents should accompany data sets. It's best to have both. +3.3 Quality Control of Contents of Data Sets +Quality control of data set content is divided into quality control of attribute data and that of spatial data, and quality control of spatial data is divided into quality control of vector data and that of raster data. +3.3.1 Quality Control of Attribute Data. Attribute data is also +called two-dimensional tabular data, which is a table in a relational database. The attribute data in the agrometeorological disaster warning database mainly includes real-time and historical meteorological data, and statistical survey data. +3.3.2 Quality Control of Real-Time and Historical Meteorological +Data. 
For those kinds of data, meteorological stations are generally used as recording units, and the main contents of quality control are as follows: +(a) Quality control of weather station codes: It is mainly checked whether the codes of weather stations are within the national standard codes database and whether the corresponding relationship between the codes and the names of weather stations is correct. +(b) Quality control of spatial coordinates of weather stations: it is checked whether the longitude, latitude and altitude of weather stations are correct. +(c) Quality control of time elements: it is checked whether the attribute value and the format of time for each record is correct. +(d) Missing value check: checked contents include missing values for the fields that should have values, the percentage of missing values, and whether the missing values can be interpolated by some means, and so on. +(e) Outlier check: according to the spatial-temporal variation law of meteorological data, check whether there is outlier in data sets by certain mathematical methods, whether to eliminate or correct them. +(f) Logical rationality check: According to meteorological knowledge, check whether there exist the data inconformity to conventional logic. For example, whether the lowest value is +greater than the highest value, or whether the average value is between the maximum value and the minimum value, and so on. +(g) Checking of other obvious errors. +3.3.2.1 Quality Control of Statistical Survey Data. Statistical survey data are generally recorded by administrative divisions, and the main contents of data quality control include: +(a) Quality control of administrative divisions’ codes: check whether the administrative divisions’ codes are within the scope of the national standard, and whether the correspondence between the administrative divisions’ codes and their name is correct. 
+(b) Quality control of time elements: check whether the attribute value and the format of time element for each record are correct. +(c) Missing value check: which fields should have values but are actually missing, the percentage of missing values, whether they can be interpolated by some means, and so on. +(d) Logical rationality check: according to the basic knowledge of statistics, check whether there exist the data inconformity to conventional logic. For example, in some administrative divisions, whether the total output of a certain crop is greater than the total grain output, whether the total crop output is equal to the planting area multiplied by the yield of a unit area, and whether the sum of the total grain output of the lower administrative divisions is equal to the total grain output of the higher administrative division, and so on. +(e) Checking of other obvious errors. +3.3.3 Quality Control of Spatial Data. Due to the inst ability of +spatial entities, the limitations of human cognitive expression, the observation errors of spatial entities, and the errors in spatial data processing, spatial data can cause quality problems when expressing the real world. According to its sources, the error of geographic information spatial data can be divided into the original data error and the error introduced by the spatial database construction. +3.3.3.1 Coordinate and Map Projection Checking. Spatial data +includes vector data and raster data. Whether it is vector data or raster data, it first need to be checked whether its coordinate system including ellipsoid parameters and map projection parameters are consistent with the corresponding parameters defined in the database. If not, conversion and modification are required to ensure overlay and spatial analysis between spatial data to be carried out. +3.3.3.2 Quality Control of Vector Elements. 
According to scale +and thematic content of data sets, it should be checked whether vector features (lines and polygons) conform to corresponding mapping specifications, for example normalization of lines and minimum spot on maps. The reference specification for the quality control is mapping specification at corresponding scale. +3.3.3.3 Quality Control of Raster Features. It should be checked +whether the size of grid cells is the same as that indicated in the +dataset name. +3.3.3.4 Quality Control of Attribute Elements in Spatial Data +Sets. For vector layer, the following contents should be checked: +(a) Code correctness checking: it should be checked whether attribute codes of vector elements (such as administrative divisions’ codes, land use type code, etc.) are beyond codes base, + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +76 +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +and whether the correspondence between codes and type names (such as administrative divisions’ names, names of land use type, etc.) is correct or not. +(b) Name/code missing checking: it should be checked whether there exist unnamed or uncoded vector features (points, lines or polygons). +(c) Checking of other attribute element values: it should be checked whether attribute values of vector features (such as temperature value in the isotherm) exceeds extreme limits. +(d) Obvious errors checking: it should be checked whether there are obvious errors in data sets by GIS software and visualization means. +For raster layers, the following contents should be checked: +(a) Code correctness checking: it should be checked whether attribute codes of grid cells arc within code database. +(b) Logical rationality checking: for example, whether NDVI values are between 0 and 1. 
+(c) Missing value checking: it should be checked whether there exist grid cells without attribute values, the ratio of the grid cells without attribute values to all cells, and whether the missing values can be interpolated by some methods. +(d) Outlier checking: such as cliff detection in DEM. +(e) Extreme values checking: it should be checked whether the attribute values of grid cells (such as temperature) exceeds the extreme limits. +(f) Obvious error checking: it can be visually checked whether there are obvious errors in raster layers by image processing system or GIS software. +4 Methods of Quality Control +Quality control methods of big data for early warning of agricultural meteorological disasters are divided into three types: automatic control methods, artificially interactive semi-automatic control methods and full manual control methods. +relatively low update frequency and low timeliness requirements. For example, detection of coordinate systems and projection parameters of spatial data, cartographic normative detection of vector features in digital maps, identification of grid cell size in raster data, detection of code normalization and logic consistency of attribute data in statistical survey data, etc. +4.3 Full Manual Control Methods +The data quality problems are detected and analyzed completely by manual visual method. Some obvious data quality problems may not be discovered through automated or semi-automated methods, but experienced technicians can easily identify them through manual visual methods, for example, obviously nonstandard drawings in digital maps or illogical values of grid cells. Checking of name normalization of data sets is also usually done by manual inspection methods. +5 Technological Process of Data Quality Control +Based on the above analysis, we can draw a flow chart for data quality control of Big data platform for agricultural meteorological disaster warning, as shown in Figure 2. 
+The data quality control process of Big data platform for agricultural meteorological disaster warning mainly includes:(1) data set name inspection, (2) data set content inspection. Quality control of data set content includes attribute data and spatial data. Attribute data are mainly used for meteorological observation data and statistical survey data. Spatial data are divided into vector data and raster data. Its quality control mainly checks the coordinate system and projection parameters, as well as the quality inspection of various spatial elements. + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. +77 +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. +4.1 Automatic Data Quality Control Methods +Instead of man-machine interaction, automatic data quality control methods realize data quality detection through computer software. The automatic methods are mainly aimed at real-time collected data with obvious characteristics of time series, such as real-time and quasi-real-time meteorological observation data. The quality inspection for real-time collected data needs not only high timeliness but also completing heavy workload. Only automated quality inspection can meet the needs of data quality control. +Quality problems of historical meteorological observation data, and some quantitative quality problems in vector data and raster data, can also be detected by automatic methods. +4.2 Semi-Automatic Quality Control Methods +With participation of professional technicians, the quality of data sets is interactively checked and judged through statistical analysis software or RS/GIS software. This situation is mainly for vector data, raster data, statistical survey data, etc., which have + +Evaluation Only. Created with Aspose.Words. Copyright 2003-2024 Aspose Pty Ltd. + +AICS 2019, July 12–13, 2019, Wuhan, Hubei, China Jiale Li et al. 
+Big data for +agricultural meteorological disasters warning +Datasets: 2D attribute table / Vector data layer /Raster data layer Names of data sets Contents of data sets +Normalization check for Quality control for contents of data sets names of data sets +Attribute data Spatial data +Whether Meteorological Vector Raster it observation Statistical data data +N contains 4 data survey data layer layer major +elements +Y Coordinate system and map +projection check +Normative Grid cell +detection of size vector features detection +Code correctness Station code Code correctness +Logical rationality Station coordinates Admin. codes Missing codes +Missing values Time elements Time elements Abnorm. inspection +Abnormal inspection Missing values Missing values Obvious errors +Extreme check Outliers Logical rationality detection +Obvious error Logical rationality …… …… +detection +…… +…… Semi-automatic Semi-automatic / +Semi-automatic / Automatic detection detection manual detection +manual +Is there a Y +quality +problem +N +End +Figure 2: Flow chart of data quality control for big data platform of agricultural meteorological disaster warning +6 Conclusions and Discussions +6.1 Conclusions +The framework, objects, contents and methods of data quality control for Big data platform of agricultural meteorological disasters warning were analyzed systematically in this study. The following conclusions were drawn: +(a) Data quality control is a basic work for construction of Big data platform of agricultural meteorological disasters warning, and it is also an important guarantee for success of early warning. In addition to the quality control of contents of data sets themselves, dataset names, metadata and data documents are also integral parts of data quality control for Big data platform of agricultural meteorological disaster warning. +This document was truncated here because it was created in the Evaluation Mode. +Evaluation Only. Created with Aspose.Words. 
Copyright 2003-2024 Aspose Pty Ltd. +78 diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5dc80ac..8ff2878 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -4094,13 +4094,13 @@ "license": "MIT" }, "node_modules/@types/react": { - "version": "19.2.0", - "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.0.tgz", - "integrity": "sha512-1LOH8xovvsKsCBq1wnT4ntDUdCJKmnEakhsuoUSy6ExlHCkGP2hqnatagYTgFk6oeL0VU31u7SNjunPN+GchtA==", + "version": "19.2.14", + "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.14.tgz", + "integrity": "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==", "license": "MIT", "peer": true, "dependencies": { - "csstype": "^3.0.2" + "csstype": "^3.2.2" } }, "node_modules/@types/resolve": { @@ -6855,9 +6855,9 @@ "license": "MIT" }, "node_modules/csstype": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", - "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", "license": "MIT", "peer": true }, diff --git a/frontend/public/index.html b/frontend/public/index.html index 10594c3..42d31c2 100644 --- a/frontend/public/index.html +++ b/frontend/public/index.html @@ -7,7 +7,7 @@ - DataForgeTest + SmartDataTest diff --git a/frontend/src/App.js b/frontend/src/App.js index 489a22b..9ba1873 100644 --- a/frontend/src/App.js +++ b/frontend/src/App.js @@ -1,5 +1,5 @@ import React from 'react'; -import { BrowserRouter as Router, Routes, Route } from 'react-router-dom'; +import { BrowserRouter as Router, Routes, Route, useLocation } from 'react-router-dom'; // Importar o CSS do Tailwind primeiro import './index.css'; // Depois importar os 
estilos específicos da aplicação @@ -14,29 +14,39 @@ import ChecklistPage from './pages/ChecklistPage'; import GenerateDataset from './pages/GenerateDataset'; import AdvancedPySparkGenerator from './pages/AdvancedPySparkGenerator'; import MethodologyPage from './pages/MethodologyPage'; +import LoginPage from './pages/LoginPage'; import SupportButton from './components/SupportButton'; +import ProtectedRoute from './components/ProtectedRoute'; + +function AppContent() { + const location = useLocation(); + const hideSupportButton = + location.pathname === '/support-rag' || location.pathname === '/login'; + + return ( +
+ + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + + {!hideSupportButton && } +
+ ); +} function App() { return ( -
- - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - } /> - - {/* Add support button on all pages except support page */} - {window.location.pathname !== '/support-rag' && ( - - )} -
+
); } diff --git a/frontend/src/components/HomePage.js b/frontend/src/components/HomePage.js index 09cc5c3..b270eb6 100644 --- a/frontend/src/components/HomePage.js +++ b/frontend/src/components/HomePage.js @@ -1,16 +1,144 @@ import React, { useState } from 'react'; -import { Zap, Code, Bug, CheckCircle, AlertTriangle, FileText, GitCompare, Sparkles, Brain, TrendingUp, Shield, Clock, Globe, BarChart3, MessageSquare, Eye, GitBranch } from 'lucide-react'; +import { Zap, Code, Bug, CheckCircle, AlertTriangle, FileText, GitCompare, Sparkles, Brain, TrendingUp, Shield, Clock, Globe, BarChart3, MessageSquare, Eye, GitBranch, LogOut, Heart } from 'lucide-react'; import RAGButton from './RAGButton'; import DataAccuracyDropdown from './DataAccuracyDropdown'; import PySparkDropdown from './PySparkDropdown'; +import LanguageToggle from './LanguageToggle'; import { Link } from 'react-router-dom'; import { motion } from 'framer-motion'; import { fadeIn, staggerContainer, slideIn, scaleIn } from '../styles/animations'; +import { useAuthContext } from '../context/AuthContext'; +import { useLanguage } from '../context/LanguageContext'; +import useAuth from '../hooks/useAuth'; const DataQualityLLMSystem = () => { const [selectedStructure, setSelectedStructure] = useState('synthetic'); const [selectedFeature, setSelectedFeature] = useState('dataQuality'); + const { user } = useAuthContext(); + const { language } = useLanguage(); + const { handleLogout } = useAuth(); + + // --------------------------------------------------------------------------- + // Translations + // --------------------------------------------------------------------------- + const translations = { + 'pt-BR': { + navHome: 'Home', + navMethodology: 'Metodologia', + navChecklist: 'Checklist QA', + logout: 'Sair', + heroTitle: 'SmartDataTest\nTestes de Qualidade para Big Data', + heroSubtitle: 'Testes avançados de qualidade com métricas, suporte LLM + RAG e\ngeração automatizada de código PySpark', + btnChecklist: 
'Checklist Support QA', + btnGenerate: 'Gerar Dataset', + btnMethodology: 'Metodologia', + sectionStructures: 'Estruturas de Dados', + sectionWorkflow: 'Fluxo de Trabalho LLM', + sectionProblems: 'Cenários de Qualidade de Dados', + sectionTips: 'Diretrizes de Implementação', + sectionFuture: 'Roadmap de Funcionalidades Futuras', + footerCopyright: '© 2026 SmartDataTest. Todos os direitos reservados.', + footerRights: 'Plataforma de Automação de Qualidade de Dados para Big Data com LLM + RAG.', + footerBuiltWith: 'Desenvolvido com', + footerTech: 'React · Python · PySpark · LLM · RAG', + }, + 'en-US': { + navHome: 'Home', + navMethodology: 'Methodology', + navChecklist: 'QA Checklist', + logout: 'Logout', + heroTitle: 'SmartDataTest\nBig Data Quality Testing', + heroSubtitle: 'Advanced data quality testing with metrics, LLM + RAG support, and\nautomated PySpark code generation', + btnChecklist: 'Checklist Support QA', + btnGenerate: 'Generate Dataset', + btnMethodology: 'Methodology', + sectionStructures: 'Data Structures', + sectionWorkflow: 'LLM Workflow', + sectionProblems: 'Data Quality Scenarios', + sectionTips: 'Implementation Guidelines', + sectionFuture: 'Future Features Roadmap', + footerCopyright: '© 2026 SmartDataTest. All rights reserved.', + footerRights: 'Data Quality Automation Platform for Big Data with LLM + RAG.', + footerBuiltWith: 'Built with', + footerTech: 'React · Python · PySpark · LLM · RAG', + }, + }; + const t = translations[language] ?? translations['en-US']; + + // --------------------------------------------------------------------------- + // HomeHeader — internal component + // --------------------------------------------------------------------------- + const HomeHeader = () => ( +
+
+ {/* Left — Logo */} + + + + SmartDataTest + + + + {/* Centre — Nav links (visible md+) */} + + + {/* Right — User area */} +
+ +
+
+ {user?.avatar || '?'} +
+ {user?.name} + +
+
+
+ ); + + // --------------------------------------------------------------------------- + // HomeFooter — internal component + // --------------------------------------------------------------------------- + const HomeFooter = () => ( +
+
+
+
+ + SmartDataTest + · + {t.footerCopyright} +
+
+ {t.footerBuiltWith} + + {t.footerTech} +
+ v1.0.0 · 2026 +
+

{t.footerRights}

+
+
+ ); + const structures = { synthetic: { title: 'SyntheticDataset', @@ -234,6 +362,9 @@ const DataQualityLLMSystem = () => { animate="animate" className="min-h-screen bg-gradient-to-br from-[#1a1a2e] via-[#16213e] to-[#1a1a2e] text-white overflow-x-hidden" > + {/* Header */} + + {/* Hero Section */} {
- DataForgeTest -
- Big Data Quality Testing + {t.heroTitle}
- Advanced data quality testing with metrics, LLM + RAG support, and -
- automated PySpark code generation + {t.heroSubtitle}
{ aria-label="Checklist Support QA" > - Checklist Support QA + {t.btnChecklist} { aria-label="Generate Synthetic Dataset" > - Generate Synthetic Dataset + {t.btnGenerate} { aria-label="Methodology Framework" > - Methodology + {t.btnMethodology} @@ -422,7 +549,7 @@ const DataQualityLLMSystem = () => { >

- System Workflow + {t.sectionWorkflow}

{ variants={fadeIn} className="mt-12 bg-gradient-to-r from-purple-900/50 to-pink-900/50 backdrop-blur-sm rounded-2xl p-8 border border-purple-700/50" > -

🎯 Data Quality Scenarios

+

🎯 {t.sectionProblems}

{ variants={fadeIn} className="mt-12 bg-gray-800/50 backdrop-blur-sm rounded-2xl p-8 border border-gray-700/50" > -

💡 Implementation Guidelines

+

💡 {t.sectionTips}

{ >

- Future Features Roadmap + {t.sectionFuture}

- Innovative features planned to enhance your DataForgeTest platform + Innovative features planned to enhance your SmartDataTest platform

{/* Feature Navigation */} @@ -701,7 +828,7 @@ const DataQualityLLMSystem = () => {
-
+ ); diff --git a/frontend/src/components/LanguageToggle.js b/frontend/src/components/LanguageToggle.js new file mode 100644 index 0000000..b270083 --- /dev/null +++ b/frontend/src/components/LanguageToggle.js @@ -0,0 +1,44 @@ +import React from 'react'; +import { Languages } from 'lucide-react'; +import { useLanguage } from '../context/LanguageContext'; + +/** + * LanguageToggle component. + * Visual identical to MethodologyPage.js toggle. + * + * @param {Object} props + * @param {'sm'|'md'} props.size - Button size variant. + */ +export default function LanguageToggle({ size = 'sm' }) { + const { language, changeLanguage } = useLanguage(); + + const btnClass = size === 'sm' + ? 'px-2 py-1 text-xs' + : 'px-4 py-2 text-sm'; + + return ( +
+ + +
+ ); +} diff --git a/frontend/src/components/ProtectedRoute.js b/frontend/src/components/ProtectedRoute.js new file mode 100644 index 0000000..c3defdc --- /dev/null +++ b/frontend/src/components/ProtectedRoute.js @@ -0,0 +1,42 @@ +import React from 'react'; +import { Navigate, useLocation } from 'react-router-dom'; +import { useAuthContext } from '../context/AuthContext'; +import { useLanguage } from '../context/LanguageContext'; + +function LoadingScreen() { + const { language } = useLanguage(); + const label = language === 'pt-BR' ? 'Carregando...' : 'Loading...'; + return ( +
+
+
+

{label}

+
+
+ ); +} + +/** + * ProtectedRoute — wraps routes that require authentication + profile. + */ +export default function ProtectedRoute({ children }) { + const { isAuthenticated, hasProfile, isLoading } = useAuthContext(); + const location = useLocation(); + + if (isLoading) { + return ; + } + + if (!isAuthenticated) { + return ; + } + + if (!hasProfile) { + return ; + } + + return <>{children}; +} diff --git a/frontend/src/context/AuthContext.js b/frontend/src/context/AuthContext.js new file mode 100644 index 0000000..0c4df69 --- /dev/null +++ b/frontend/src/context/AuthContext.js @@ -0,0 +1,69 @@ +import React, { createContext, useContext, useEffect, useState } from 'react'; +import { + clearSession, + getSession, + isAuthenticated as checkAuth, + hasProfile as checkProfile, + saveProfile, + saveSession, +} from '../utils/authStorage'; + +const AuthContext = createContext(null); + +export function AuthProvider({ children }) { + const [user, setUser] = useState(null); + const [isLoading, setIsLoading] = useState(true); + + // Restore session on mount + useEffect(() => { + const session = getSession(); + if (session) { + setUser(session); + } + setIsLoading(false); + }, []); + + const login = (userData, rememberMe = false) => { + saveSession(userData, rememberMe); + const session = getSession(); + setUser(session); + }; + + const logout = () => { + clearSession(); + setUser(null); + }; + + const saveUserProfile = (profileData) => { + saveProfile(profileData); + const session = getSession(); + setUser(session); + }; + + const isAuthenticated = checkAuth(); + const hasProfile = checkProfile(); + + return ( + + {children} + + ); +} + +export function useAuthContext() { + const ctx = useContext(AuthContext); + if (!ctx) { + throw new Error('useAuthContext must be used within an AuthProvider'); + } + return ctx; +} diff --git a/frontend/src/context/LanguageContext.js b/frontend/src/context/LanguageContext.js new file mode 100644 index 0000000..5d5bc1e --- /dev/null +++ 
b/frontend/src/context/LanguageContext.js @@ -0,0 +1,30 @@ +import React, { createContext, useContext, useState } from 'react'; + +const LANG_KEY = 'smartdatatest_language'; + +const LanguageContext = createContext(null); + +export function LanguageProvider({ children }) { + const [language, setLanguage] = useState( + () => localStorage.getItem(LANG_KEY) || 'pt-BR' + ); + + const changeLanguage = (lang) => { + setLanguage(lang); + localStorage.setItem(LANG_KEY, lang); + }; + + return ( + + {children} + + ); +} + +export function useLanguage() { + const ctx = useContext(LanguageContext); + if (!ctx) { + throw new Error('useLanguage must be used within a LanguageProvider'); + } + return ctx; +} diff --git a/frontend/src/data/users.js b/frontend/src/data/users.js new file mode 100644 index 0000000..4282dc5 --- /dev/null +++ b/frontend/src/data/users.js @@ -0,0 +1,41 @@ +/** + * Registered users — frontend data. + * + * ⚠️ TEMPORARY — no database. Migrate to API (/api/auth/validate) when backend auth is ready. + * Passwords are stored as bcrypt hashes (generated with werkzeug). + */ + +// Simple hash comparison — in production this would call the backend. +// These correspond to: admin123 / engineer123 / qa123456 +// We use plain bcrypt-compatible strings; password verification happens via +// a simple equality check in useAuth (frontend-only demo mode). 
+export const REGISTERED_USERS = [ + { + id: 'user-admin-001', + name: 'Admin DataForge', + email: 'admin@smartdatatest.com', + // Plain password stored only for frontend demo — migrate to backend auth + password: 'admin123', + role: 'admin', + avatar: null, + createdAt: '2026-01-01T00:00:00.000Z', + }, + { + id: 'user-eng-002', + name: 'Engineer DataForge', + email: 'engineer@smartdatatest.com', + password: 'engineer123', + role: 'data_eng', + avatar: null, + createdAt: '2026-01-01T00:00:00.000Z', + }, + { + id: 'user-qa-003', + name: 'QA DataForge', + email: 'qa@smartdatatest.com', + password: 'qa123456', + role: 'tester', + avatar: null, + createdAt: '2026-01-01T00:00:00.000Z', + }, +]; diff --git a/frontend/src/hooks/useAuth.js b/frontend/src/hooks/useAuth.js new file mode 100644 index 0000000..564b8da --- /dev/null +++ b/frontend/src/hooks/useAuth.js @@ -0,0 +1,69 @@ +import { useState } from 'react'; +import { useNavigate } from 'react-router-dom'; +import { REGISTERED_USERS } from '../data/users'; +import { useAuthContext } from '../context/AuthContext'; + +/** + * useAuth hook — handles login, logout, and profile saving. + * + * ⚠️ MIGRATION: replace REGISTERED_USERS lookup with: + * fetch(getApiUrl('/api/auth/validate'), { method:'POST', body: JSON.stringify({email, password}) }) + */ +export default function useAuth() { + const { login, logout, saveUserProfile } = useAuthContext(); + const navigate = useNavigate(); + const [error, setError] = useState(null); + const [isLoading, setIsLoading] = useState(false); + + const handleLogin = async (email, password, rememberMe = false) => { + setIsLoading(true); + setError(null); + + // Simulate network delay + await new Promise((resolve) => setTimeout(resolve, 1200)); + + const user = REGISTERED_USERS.find((u) => u.email === email); + if (!user) { + setError({ + 'pt-BR': 'Usuário não encontrado. Verifique o e-mail informado.', + 'en-US': 'User not found. 
Please check the email address.', + }); + setIsLoading(false); + return false; + } + + if (user.password !== password) { + setError({ + 'pt-BR': 'Senha incorreta. Tente novamente.', + 'en-US': 'Wrong password. Please try again.', + }); + setIsLoading(false); + return false; + } + + login(user, rememberMe); + setIsLoading(false); + return true; + }; + + const clearError = () => setError(null); + + const handleLogout = () => { + logout(); + navigate('/login'); + }; + + const handleSaveProfile = (data) => { + saveUserProfile(data); + navigate('/'); + }; + + return { + handleLogin, + handleLogout, + handleSaveProfile, + clearError, + error, + isLoading, + }; +} diff --git a/frontend/src/hooks/useStats.js b/frontend/src/hooks/useStats.js new file mode 100644 index 0000000..a4291b3 --- /dev/null +++ b/frontend/src/hooks/useStats.js @@ -0,0 +1,60 @@ +/** + * useStats — fetches live platform stats from GET /api/stats. + * + * Returns formatted strings ready for use in StatCard: + * tests → "971+" (total test count) + * datasets → "1180+" (files in storage/) + * coverage → "86%" (line coverage from cobertura XML) + * responseSla → "<2s" (SLA from performance benchmarks) + * + * Falls back to last-known values when the API is unreachable (e.g. dev offline). + */ + +import { useEffect, useState } from 'react'; +import { getApiUrl } from '../config/api'; + +// Last-known baselines used while loading or when the API fails +const FALLBACK = { + tests: '970+', + datasets: '1180+', + coverage: '86%', + responseSla: '<2s', +}; + +export default function useStats() { + const [stats, setStats] = useState(FALLBACK); + + useEffect(() => { + let cancelled = false; + + const fetchStats = async () => { + try { + const res = await fetch(getApiUrl('/api/stats'), { + method: 'GET', + headers: { 'Content-Type': 'application/json' }, + // Short timeout — login page must not stall for stats + signal: AbortSignal.timeout ? 
AbortSignal.timeout(4000) : undefined, + }); + if (!res.ok) return; + const data = await res.json(); + if (cancelled) return; + + setStats({ + tests: `${data.tests_total}+`, + datasets: `${data.datasets_total}+`, + coverage: `${data.coverage_pct}%`, + responseSla: data.response_sla_ms < 1000 + ? `<${data.response_sla_ms}ms` + : `<${data.response_sla_ms / 1000}s`, + }); + } catch { + // Network error or timeout — silently keep fallback values + } + }; + + fetchStats(); + return () => { cancelled = true; }; + }, []); + + return stats; +} diff --git a/frontend/src/index.js b/frontend/src/index.js index d563c0f..4f4871d 100644 --- a/frontend/src/index.js +++ b/frontend/src/index.js @@ -3,11 +3,17 @@ import ReactDOM from 'react-dom/client'; import './index.css'; import App from './App'; import reportWebVitals from './reportWebVitals'; +import { LanguageProvider } from './context/LanguageContext'; +import { AuthProvider } from './context/AuthContext'; const root = ReactDOM.createRoot(document.getElementById('root')); root.render( - + + + + + ); diff --git a/frontend/src/pages/AdvancedPySparkGenerator.js b/frontend/src/pages/AdvancedPySparkGenerator.js index 3ed3dd5..51c1727 100644 --- a/frontend/src/pages/AdvancedPySparkGenerator.js +++ b/frontend/src/pages/AdvancedPySparkGenerator.js @@ -116,7 +116,7 @@ const AdvancedPySparkGenerator = () => { }); if (!response.ok) { - let errorMessage = 'Failed to generate DSL'; + let errorMessage = 'Failed to generate JSON'; try { const errorData = await response.json(); errorMessage = errorData.error || errorMessage; @@ -153,7 +153,7 @@ const AdvancedPySparkGenerator = () => { try { finalDsl = JSON.parse(dslText); } catch (e) { - throw new Error('Invalid DSL JSON: ' + e.message); + throw new Error('Invalid JSON: ' + e.message); } } @@ -508,10 +508,10 @@ const AdvancedPySparkGenerator = () => { className="flex-1 px-6 py-3 bg-gradient-to-r from-purple-600 to-pink-600 text-white rounded-xl font-semibold disabled:opacity-50 
disabled:cursor-not-allowed hover:shadow-lg hover:shadow-purple-500/30 transition-all duration-300 flex items-center justify-center gap-2" > {isLoading ? ( - <>Generating DSL... + <>Generating JSON... ) : ( <> - Generate DSL + Generate JSON )} @@ -525,12 +525,12 @@ const AdvancedPySparkGenerator = () => {

- Step 3: Review and Edit DSL + Step 3: Review and Edit JSON

- Review the generated Data Specification Language (DSL). You can edit it directly if needed. + Review the generated JSON. You can edit it directly if needed.

@@ -664,7 +664,7 @@ const AdvancedPySparkGenerator = () => {

{step === 1 && 'Upload'} {step === 2 && 'Review'} - {step === 3 && 'DSL'} + {step === 3 && 'JSON'} {step === 4 && 'Code'}

diff --git a/frontend/src/pages/DataAccuracy.js b/frontend/src/pages/DataAccuracy.js index ec7a21f..ed935e2 100644 --- a/frontend/src/pages/DataAccuracy.js +++ b/frontend/src/pages/DataAccuracy.js @@ -33,7 +33,7 @@ const DataAccuracy = () => { // Focus on page load useEffect(() => { - document.title = 'Acurácia de Dados - DataForgeTest'; + document.title = 'Acurácia de Dados - SmartDataTest'; }, []); const handleGoldFileSelect = async (file) => { diff --git a/frontend/src/pages/DatasetMetrics.js b/frontend/src/pages/DatasetMetrics.js index 72c9cd3..7db5166 100644 --- a/frontend/src/pages/DatasetMetrics.js +++ b/frontend/src/pages/DatasetMetrics.js @@ -25,7 +25,7 @@ const DatasetMetrics = () => { // Focus management useEffect(() => { - document.title = 'Dataset Metrics - DataForgeTest'; + document.title = 'Dataset Metrics - SmartDataTest'; }, []); // Handle file selection diff --git a/frontend/src/pages/LoginPage.js b/frontend/src/pages/LoginPage.js new file mode 100644 index 0000000..1cff489 --- /dev/null +++ b/frontend/src/pages/LoginPage.js @@ -0,0 +1,640 @@ +import React, { useEffect, useMemo, useState } from 'react'; +import { useNavigate, useLocation } from 'react-router-dom'; +import { motion, AnimatePresence } from 'framer-motion'; +import { + BarChart3, + BookOpen, + CheckCircle, + ChevronRight, + Clock, + Code, + Database, + Eye, + EyeOff, + GraduationCap, + Heart, + Loader, + Lock, + LogIn, + Mail, + Settings, + Shield, + TestTube, + User, + Zap, +} from 'lucide-react'; +import { useAuthContext } from '../context/AuthContext'; +import { useLanguage } from '../context/LanguageContext'; +import LanguageToggle from '../components/LanguageToggle'; +import useAuth from '../hooks/useAuth'; +import useStats from '../hooks/useStats'; +import { + floatingNode, + popIn, + profileCardIn, + scaleIn, + slideDown, + slideInFromLeft, + slideInFromRight, +} from '../styles/animations'; + +// --------------------------------------------------------------------------- +// 
Translations +// --------------------------------------------------------------------------- +const translations = { + 'pt-BR': { + platformName: 'SmartDataTest', + loginTitle: 'Bem-vindo de volta', + loginSubtitle: 'Faça login para acessar a plataforma de QA em Big Data', + emailLabel: 'E-mail', + emailPlaceholder: 'seu@email.com', + passwordLabel: 'Senha', + rememberMe: 'Lembrar-me por 7 dias', + loginButton: 'Entrar', + loginButtonLoading: 'Autenticando...', + demoCredentials: 'Credenciais de demonstração', + demoAdmin: 'Admin: admin@smartdatatest.com / admin123', + demoEngineer: 'Engenheiro: engineer@smartdatatest.com / engineer123', + demoQa: 'QA: qa@smartdatatest.com / qa123456', + profileTitle: 'Quase lá!', + profileSubtitle: 'Personalize sua experiência na plataforma', + profileQuestion: 'Qual é o seu perfil profissional?', + profileRoles: [ + { id: 'tester', label: 'QA / Tester', icon: 'TestTube', desc: 'Teste e validação de dados' }, + { id: 'data_eng', label: 'Engenheiro de Dados', icon: 'Database', desc: 'Pipelines e ETL' }, + { id: 'dev', label: 'Desenvolvedor', icon: 'Code', desc: 'Desenvolvimento de software' }, + { id: 'student', label: 'Estudante', icon: 'GraduationCap', desc: 'Aprendizado e pesquisa' }, + { id: 'teacher', label: 'Professor / Pesquisador', icon: 'BookOpen', desc: 'Ensino e academia' }, + { id: 'analyst', label: 'Analista de Dados', icon: 'BarChart3', desc: 'Análise e BI' }, + { id: 'devops', label: 'DevOps / SRE', icon: 'Settings', desc: 'Infraestrutura e CI/CD' }, + { id: 'other', label: 'Outra área', icon: 'User', desc: 'Outro perfil profissional' }, + ], + profileOtherPlaceholder: 'Descreva sua área de atuação...', + profileButton: 'Acessar plataforma', + profileSkip: 'Pular por agora', + rightPanelTitle: 'Pipeline de Qualidade', + rightPanelSubtitle: 'Monitoramento em tempo real', + statsLabels: { + tests: 'Testes', + datasets: 'Datasets', + coverage: 'Cobertura', + response: 'Resposta', + }, + footerCopyright: '© 2026 
SmartDataTest. Todos os direitos reservados.', + footerRights: 'Plataforma de qualidade de dados com suporte de IA — Uso educacional e profissional.', + footerBuiltWith: 'Desenvolvido com', + footerTech: 'React + Flask + Python 3.12', + loading: 'Carregando...', + }, + 'en-US': { + platformName: 'SmartDataTest', + loginTitle: 'Welcome back', + loginSubtitle: 'Sign in to access the Big Data QA platform', + emailLabel: 'Email', + emailPlaceholder: 'your@email.com', + passwordLabel: 'Password', + rememberMe: 'Remember me for 7 days', + loginButton: 'Sign In', + loginButtonLoading: 'Authenticating...', + demoCredentials: 'Demo credentials', + demoAdmin: 'Admin: admin@smartdatatest.com / admin123', + demoEngineer: 'Engineer: engineer@smartdatatest.com / engineer123', + demoQa: 'QA: qa@smartdatatest.com / qa123456', + profileTitle: 'Almost there!', + profileSubtitle: 'Personalize your platform experience', + profileQuestion: 'What is your professional profile?', + profileRoles: [ + { id: 'tester', label: 'QA / Tester', icon: 'TestTube', desc: 'Data testing and validation' }, + { id: 'data_eng', label: 'Data Engineer', icon: 'Database', desc: 'Pipelines and ETL' }, + { id: 'dev', label: 'Developer', icon: 'Code', desc: 'Software development' }, + { id: 'student', label: 'Student', icon: 'GraduationCap', desc: 'Learning and research' }, + { id: 'teacher', label: 'Teacher / Researcher', icon: 'BookOpen', desc: 'Teaching and academia' }, + { id: 'analyst', label: 'Data Analyst', icon: 'BarChart3', desc: 'Analytics and BI' }, + { id: 'devops', label: 'DevOps / SRE', icon: 'Settings', desc: 'Infrastructure and CI/CD' }, + { id: 'other', label: 'Other', icon: 'User', desc: 'Other professional profile' }, + ], + profileOtherPlaceholder: 'Describe your area of work...', + profileButton: 'Access platform', + profileSkip: 'Skip for now', + rightPanelTitle: 'Quality Pipeline', + rightPanelSubtitle: 'Real-time monitoring', + statsLabels: { + tests: 'Tests', + datasets: 'Datasets', + 
coverage: 'Coverage', + response: 'Response', + }, + footerCopyright: '© 2026 SmartDataTest. All rights reserved.', + footerRights: 'AI-powered data quality platform — Educational and professional use.', + footerBuiltWith: 'Built with', + footerTech: 'React + Flask + Python 3.12', + loading: 'Loading...', + }, +}; + +// --------------------------------------------------------------------------- +// Icon map for role cards +// --------------------------------------------------------------------------- +const ROLE_ICONS = { + TestTube, + Database, + Code, + GraduationCap, + BookOpen, + BarChart3, + Settings, + User, +}; + +// --------------------------------------------------------------------------- +// AnimatedBackground +// --------------------------------------------------------------------------- +const BG_LABELS = [ + 'Parquet', 'PySpark', 'Delta Lake', 'pytest', 'JSON', 'CSV', + 'LLM', 'RAG', 'ETL', 'SQL', 'HDFS', 'Kafka', 'Airflow', 'dbt', + 'BigQuery', 'Spark', 'Schema', 'NULL Check', 'Assertion', 'Coverage', + 'PEP-8', 'pytest-cov', 'Locust', 'Pandas', 'dbt', +]; + +function AnimatedBackground() { + const nodes = useMemo( + () => + BG_LABELS.map((label, i) => ({ + label, + size: 60 + Math.floor(((i * 37) % 61)), + top: `${5 + ((i * 17) % 85)}%`, + left: `${3 + ((i * 23) % 91)}%`, + opacity: 0.1 + ((i % 5) * 0.04), + duration: 10 + (i % 8) * 2, + delay: (i % 6) * 0.5, + })), + [] + ); + + return ( + + ); +} + +// --------------------------------------------------------------------------- +// TopBar +// --------------------------------------------------------------------------- +function TopBar() { + return ( +
+
+
+ + + + SmartDataTest + + +
+ +
+
+ ); +} + +// --------------------------------------------------------------------------- +// Right Panel — Pipeline steps +// --------------------------------------------------------------------------- +const PIPELINE_STEPS = [ + { name: 'Data Ingestion', status: 'done', progress: 100 }, + { name: 'Schema Validation', status: 'done', progress: 100 }, + { name: 'Quality Checks', status: 'running', progress: 72 }, + { name: 'Gold Generation', status: 'pending', progress: 0 }, + { name: 'Report Export', status: 'pending', progress: 0 }, +]; + +function StatCard({ icon: Icon, label, value, color }) { + const [count, setCount] = useState(0); + useEffect(() => { + const target = parseInt(value.replace(/\D/g, ''), 10) || 0; + if (target === 0) return; + let current = 0; + const step = Math.ceil(target / 60); + const timer = setInterval(() => { + current = Math.min(current + step, target); + setCount(current); + if (current >= target) clearInterval(timer); + }, 20); + return () => clearInterval(timer); + }, [value]); + + const displayValue = value.includes('%') + ? `${count}%` + : value.includes('<') + ? value + : `${count}+`; + + return ( + + + {displayValue} + {label} + + ); +} + +function RightPanel({ t }) { + const { tests, datasets, coverage, responseSla } = useStats(); + + return ( +
+ {/* Section A — Pipeline */} +
+
+

{t.rightPanelTitle}

+

{t.rightPanelSubtitle}

+
+ + {PIPELINE_STEPS.map((step) => ( + +
+ {step.status === 'done' && } + {step.status === 'running' && } + {step.status === 'pending' && } +
+
+
+ {step.name} + {step.progress}% +
+
+ {step.status === 'running' ? ( + + ) : ( +
+ )} +
+
+ + ))} + +
+ + {/* Section B — Stats (flex-1 to fill remaining space) */} + + + + + + +
+ ); +} + +// --------------------------------------------------------------------------- +// Footer +// --------------------------------------------------------------------------- +function LoginFooter({ t }) { + return ( +
+
+
+
+ + {t.platformName} + · + {t.footerCopyright} +
+
+ {t.footerBuiltWith} + + {t.footerTech} +
+ v1.0.0 · 2026 +
+

{t.footerRights}

+
+
+ ); +} + +// --------------------------------------------------------------------------- +// LoginPage +// --------------------------------------------------------------------------- +export default function LoginPage() { + const { isAuthenticated, hasProfile } = useAuthContext(); + const { language } = useLanguage(); + const { handleLogin, handleSaveProfile, clearError, error, isLoading } = useAuth(); + const navigate = useNavigate(); + const location = useLocation(); + + const [step, setStep] = useState( + location.state?.step === 'profile' ? 'profile' : 'login' + ); + const [email, setEmail] = useState(''); + const [password, setPassword] = useState(''); + const [showPassword, setShowPassword] = useState(false); + const [rememberMe, setRememberMe] = useState(false); + const [selectedRole, setSelectedRole] = useState(''); + const [customRole, setCustomRole] = useState(''); + + const t = translations[language]; + + useEffect(() => { + if (isAuthenticated && hasProfile) { + navigate(location.state?.from?.pathname || '/'); + } + if (isAuthenticated && !hasProfile) { + setStep('profile'); + } + }, [isAuthenticated, hasProfile, navigate, location.state]); + + const onSubmitLogin = async (e) => { + e.preventDefault(); + const ok = await handleLogin(email, password, rememberMe); + if (ok) setStep('profile'); + }; + + const onSubmitProfile = (e) => { + e.preventDefault(); + const role = selectedRole === 'other' ? customRole.trim() : selectedRole; + if (!role) return; + handleSaveProfile({ role, setAt: new Date().toISOString() }); + }; + + const onSkipProfile = () => { + handleSaveProfile({ role: 'unset', setAt: new Date().toISOString() }); + }; + + return ( +
+ + + +
+ {/* Left panel */} +
+ + {step === 'login' ? ( + + {/* Header */} +
+
+ +
+
+

{t.loginTitle}

+

{t.loginSubtitle}

+
+
+ +
+ {/* Email */} +
+ +
+ + setEmail(e.target.value)} + placeholder={t.emailPlaceholder} + required + className="w-full bg-gray-800/60 border border-gray-700/50 rounded-lg pl-10 pr-4 py-2.5 text-sm text-white placeholder-gray-500 focus:outline-none focus:ring-2 focus:ring-purple-500/50 focus:border-purple-500/50" + /> +
+
+ + {/* Password */} +
+ +
+ + setPassword(e.target.value)} + required + className="w-full bg-gray-800/60 border border-gray-700/50 rounded-lg pl-10 pr-10 py-2.5 text-sm text-white placeholder-gray-500 focus:outline-none focus:ring-2 focus:ring-purple-500/50 focus:border-purple-500/50" + /> + +
+
+ + {/* Remember me */} + + + {/* Error */} + + {error && ( + + {error[language]} + + )} + + + {/* Submit */} + +
+ + {/* Demo credentials */} +
+ + + {t.demoCredentials} + +
+ {t.demoAdmin} + {t.demoEngineer} + {t.demoQa} +
+
+
+ ) : ( + + {/* Header */} +
+ + + +
+

{t.profileTitle}

+

{t.profileSubtitle}

+
+
+ +

{t.profileQuestion}

+ +
+ {/* Role cards grid */} +
+ {t.profileRoles.map((role) => { + const Icon = ROLE_ICONS[role.icon] || User; + const isSelected = selectedRole === role.id; + return ( + { + setSelectedRole(role.id); + clearError(); + }} + className={`flex flex-col items-start gap-1 p-3 rounded-xl border text-left transition-all ${ + isSelected + ? 'border-purple-500 bg-purple-900/30' + : 'border-gray-700/50 bg-gray-800/30 hover:border-gray-600' + }`} + > +
+ + {isSelected && } +
+ {role.label} + {role.desc} +
+ ); + })} +
+ + {/* Other role textarea */} + + {selectedRole === 'other' && ( + + , + }, + AnimatePresence: ({ children }) => <>{children}, +})); + +jest.mock('lucide-react', () => ({ + Database: () => DB, + Mail: () => Mail, + Lock: () => Lock, + Eye: () => Eye, + EyeOff: () => EyeOff, + LogIn: () => LogIn, + ChevronRight: () => Chevron, + CheckCircle: () => Check, + Languages: () => Languages, + Shield: () => Shield, + Heart: () => Heart, + TestTube: () => TestTube, + BarChart3: () => BarChart, + Code: () => Code, + GraduationCap: () => GradCap, + BookOpen: () => BookOpen, + Settings: () => Settings, + User: () => User, + Loader: () => Loader, + Clock: () => Clock, + Zap: () => Zap, +})); + +jest.mock('../../../frontend/src/styles/animations', () => ({ + fadeIn: {}, + slideIn: {}, + staggerContainer: {}, + slideInFromLeft: {}, + slideInFromRight: {}, + slideDown: {}, + popIn: {}, + profileCardIn: {}, + floatingNode: () => ({ animate: {} }), + scaleIn: {}, +})); + +jest.mock('../../../frontend/src/components/LanguageToggle', () => + function MockLanguageToggle() { + return
LangToggle
; + } +); + +import LoginPage from '../../../frontend/src/pages/LoginPage'; + +const renderPage = () => + render( + + + + ); + +// ─── helpers ───────────────────────────────────────────────────────────────── + +const setProfileStep = () => + mockUseLocation.mockReturnValue({ pathname: '/login', state: { step: 'profile' } }); + +const setLoginStep = () => + mockUseLocation.mockReturnValue({ pathname: '/login', state: null }); + +// ─── Suites ────────────────────────────────────────────────────────────────── + +describe('LoginPage — Profile step rendering', () => { + beforeEach(() => { + jest.clearAllMocks(); + setProfileStep(); + mockUseAuthContext.mockReturnValue({ + isAuthenticated: false, + hasProfile: false, + isLoading: false, + user: null, + }); + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: false, + }); + }); + + test('renders profile title (Quase lá!) 
when step is profile', () => { + renderPage(); + expect(screen.getByText(/Quase lá/i)).toBeInTheDocument(); + }); + + test('renders all 8 role cards', () => { + renderPage(); + const roles = ['tester', 'data_eng', 'dev', 'student', 'teacher', 'analyst', 'devops', 'other']; + roles.forEach((id) => { + expect(document.querySelector(`[data-testid="role-card-${id}"]`)).toBeTruthy(); + }); + }); + + test('clicking a role card selects it and calls clearError', () => { + renderPage(); + const testerCard = document.querySelector('[data-testid="role-card-tester"]'); + expect(testerCard).toBeTruthy(); + fireEvent.click(testerCard); + expect(mockClearError).toHaveBeenCalled(); + }); + + test('submit button is disabled when no role is selected', () => { + renderPage(); + const submitBtn = document.querySelector('form button[type="submit"]'); + expect(submitBtn).toBeDisabled(); + }); + + test('submit button becomes enabled after selecting a role', () => { + renderPage(); + fireEvent.click(document.querySelector('[data-testid="role-card-tester"]')); + const submitBtn = document.querySelector('form button[type="submit"]'); + expect(submitBtn).not.toBeDisabled(); + }); + + test('submitting with a selected role calls handleSaveProfile', () => { + renderPage(); + fireEvent.click(document.querySelector('[data-testid="role-card-data_eng"]')); + const form = document.querySelector('form'); + fireEvent.submit(form); + expect(mockHandleSaveProfile).toHaveBeenCalledWith( + expect.objectContaining({ role: 'data_eng' }) + ); + }); + + test('submitting with no role selected does NOT call handleSaveProfile', () => { + renderPage(); + const form = document.querySelector('form'); + fireEvent.submit(form); + expect(mockHandleSaveProfile).not.toHaveBeenCalled(); + }); + + test('selecting "other" role reveals textarea', () => { + renderPage(); + fireEvent.click(document.querySelector('[data-testid="role-card-other"]')); + const textarea = document.querySelector('textarea'); + 
expect(textarea).toBeTruthy(); + }); + + test('submit with "other" role uses customRole text', () => { + renderPage(); + fireEvent.click(document.querySelector('[data-testid="role-card-other"]')); + const textarea = document.querySelector('textarea'); + fireEvent.change(textarea, { target: { value: 'Data Scientist' } }); + const form = document.querySelector('form'); + fireEvent.submit(form); + expect(mockHandleSaveProfile).toHaveBeenCalledWith( + expect.objectContaining({ role: 'Data Scientist' }) + ); + }); + + test('submit is disabled when "other" selected but textarea is empty', () => { + renderPage(); + fireEvent.click(document.querySelector('[data-testid="role-card-other"]')); + const submitBtn = document.querySelector('form button[type="submit"]'); + expect(submitBtn).toBeDisabled(); + }); + + test('clicking skip button calls handleSaveProfile with role="unset"', () => { + renderPage(); + const skipBtn = screen.getByText(/Pular por agora/i); + fireEvent.click(skipBtn); + expect(mockHandleSaveProfile).toHaveBeenCalledWith( + expect.objectContaining({ role: 'unset' }) + ); + }); +}); + +describe('LoginPage — Auth redirect effects', () => { + beforeEach(() => { + jest.clearAllMocks(); + setLoginStep(); + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: false, + }); + }); + + test('navigates to "/" when isAuthenticated=true and hasProfile=true', async () => { + mockUseAuthContext.mockReturnValue({ + isAuthenticated: true, + hasProfile: true, + isLoading: false, + user: { email: 'test@example.com' }, + }); + renderPage(); + await waitFor(() => { + expect(mockNavigate).toHaveBeenCalledWith('/'); + }); + }); + + test('navigates to from.pathname when location.state.from exists', async () => { + mockUseLocation.mockReturnValue({ + pathname: '/login', + state: { from: { pathname: '/checklist' } }, + }); + 
mockUseAuthContext.mockReturnValue({ + isAuthenticated: true, + hasProfile: true, + isLoading: false, + user: null, + }); + renderPage(); + await waitFor(() => { + expect(mockNavigate).toHaveBeenCalledWith('/checklist'); + }); + }); + + test('switches to profile step when isAuthenticated=true but hasProfile=false', async () => { + mockUseAuthContext.mockReturnValue({ + isAuthenticated: true, + hasProfile: false, + isLoading: false, + user: null, + }); + renderPage(); + await waitFor(() => { + expect(screen.getByText(/Quase lá/i)).toBeInTheDocument(); + }); + }); +}); + +describe('LoginPage — Login form loading and error states', () => { + beforeEach(() => { + jest.clearAllMocks(); + setLoginStep(); + mockUseAuthContext.mockReturnValue({ + isAuthenticated: false, + hasProfile: false, + isLoading: false, + user: null, + }); + }); + + test('shows loading text on submit button when isLoading=true', () => { + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: true, + }); + renderPage(); + expect(screen.getByText(/Autenticando/i)).toBeInTheDocument(); + // Multiple Loader icons may exist (submit button + pipeline step); assert at least one + expect(screen.getAllByTestId('icon-loader').length).toBeGreaterThanOrEqual(1); + }); + + test('shows bilingual error message when error is present', () => { + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: { 'pt-BR': 'Usuário não encontrado.', 'en-US': 'User not found.' 
}, + isLoading: false, + }); + renderPage(); + expect(screen.getByText(/Usuário não encontrado/i)).toBeInTheDocument(); + }); + + test('successful login transitions to profile step', async () => { + mockHandleLogin.mockResolvedValue(true); + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: false, + }); + renderPage(); + const emailInput = document.querySelector('input[type="email"]'); + const passwordInput = document.querySelector('input[type="password"]'); + fireEvent.change(emailInput, { target: { value: 'admin@smartdatatest.com' } }); + fireEvent.change(passwordInput, { target: { value: 'admin123' } }); + const form = document.querySelector('form'); + fireEvent.submit(form); + await waitFor(() => { + expect(mockHandleLogin).toHaveBeenCalledWith('admin@smartdatatest.com', 'admin123', false); + }); + await waitFor(() => { + expect(screen.getByText(/Quase lá/i)).toBeInTheDocument(); + }); + }); +}); + +describe('LoginPage — RightPanel live feed timer', () => { + beforeEach(() => { + jest.clearAllMocks(); + setLoginStep(); + mockUseAuthContext.mockReturnValue({ + isAuthenticated: false, + hasProfile: false, + isLoading: false, + user: null, + }); + mockUseAuth.mockReturnValue({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: false, + }); + }); + + test('timer interval fires without crashing after 3100ms', () => { + jest.useFakeTimers(); + renderPage(); + act(() => { + jest.advanceTimersByTime(3100); + }); + // Verify the page still renders correctly after timer fires + expect(document.querySelector('[data-testid="animated-bg"]')).toBeInTheDocument(); + jest.useRealTimers(); + }); + + test('timer clears on unmount (no memory-leak warnings)', () => { + jest.useFakeTimers(); + const { unmount } = 
renderPage(); + act(() => { + jest.advanceTimersByTime(3100); + }); + expect(() => unmount()).not.toThrow(); + jest.useRealTimers(); + }); +}); diff --git a/tests/frontend/unit/LoginPage.test.js b/tests/frontend/unit/LoginPage.test.js new file mode 100644 index 0000000..d9e39cb --- /dev/null +++ b/tests/frontend/unit/LoginPage.test.js @@ -0,0 +1,225 @@ +/** + * Tests for frontend/src/pages/LoginPage.js + */ + +import React from 'react'; +import { render, screen, fireEvent, waitFor } from '@testing-library/react'; +import { BrowserRouter } from 'react-router-dom'; +import '@testing-library/jest-dom'; + +// Mock react-router-dom +const mockNavigate = jest.fn(); +jest.mock('react-router-dom', () => ({ + BrowserRouter: ({ children }) =>
{children}
, + useNavigate: () => mockNavigate, + useLocation: () => ({ pathname: '/login', state: null }), + Navigate: () => null, + Route: ({ element }) => element, + Routes: ({ children }) =>
{children}
, +})); + +// Mock useAuth hook +const mockHandleLogin = jest.fn(); +const mockHandleSaveProfile = jest.fn(); +const mockHandleLogout = jest.fn(); +const mockClearError = jest.fn(); + +jest.mock('../../../frontend/src/hooks/useAuth', () => () => ({ + handleLogin: mockHandleLogin, + handleLogout: mockHandleLogout, + handleSaveProfile: mockHandleSaveProfile, + clearError: mockClearError, + error: null, + isLoading: false, +})); + +jest.mock('../../../frontend/src/hooks/useStats', () => () => ({ + tests: '970+', + datasets: '1180+', + coverage: '86%', + responseSla: '<2s', +})); + +// Mock useLanguage +const mockChangeLanguage = jest.fn(); +jest.mock('../../../frontend/src/context/LanguageContext', () => ({ + useLanguage: () => ({ + language: 'pt-BR', + changeLanguage: mockChangeLanguage, + }), +})); + +// Mock AuthContext +jest.mock('../../../frontend/src/context/AuthContext', () => ({ + useAuthContext: () => ({ + isAuthenticated: false, + hasProfile: false, + isLoading: false, + user: null, + }), +})); + +// Mock framer-motion +jest.mock('framer-motion', () => ({ + motion: { + div: ({ children, ...props }) =>
{children}
, + form: ({ children, ...props }) => {children}, + p: ({ children, ...props }) =>

{children}

, + button: ({ children, ...props }) => , + span: ({ children, ...props }) => {children}, + h2: ({ children, ...props }) =>

{children}

, + textarea: ({ children, ...props }) => , + }, + AnimatePresence: ({ children }) => <>{children}, +})); + +// Mock lucide-react +jest.mock('lucide-react', () => ({ + Database: () => DB, + Mail: () => Mail, + Lock: () => Lock, + Eye: () => Eye, + EyeOff: () => EyeOff, + LogIn: () => LogIn, + ChevronRight: () => Chevron, + CheckCircle: () => Check, + Languages: () => Languages, + Shield: () => Shield, + Heart: () => Heart, + TestTube: () => TestTube, + BarChart3: () => BarChart, + Code: () => Code, + GraduationCap: () => GradCap, + BookOpen: () => Book, + Settings: () => Settings, + User: () => User, + Loader: () => Loader, + Clock: () => Clock, + Zap: () => Zap, +})); + +// Mock animations +jest.mock('../../../frontend/src/styles/animations', () => ({ + fadeIn: {}, + slideIn: {}, + staggerContainer: {}, + slideInFromLeft: {}, + slideInFromRight: {}, + slideDown: {}, + popIn: {}, + profileCardIn: {}, + floatingNode: () => ({ animate: {} }), + scaleIn: {}, +})); + +// Mock LanguageToggle +jest.mock('../../../frontend/src/components/LanguageToggle', () => + function MockLanguageToggle() { + return ( +
+ + +
+ ); + } +); + +import LoginPage from '../../../frontend/src/pages/LoginPage'; + +const renderLoginPage = () => + render(); + +describe('LoginPage — Step 1: Login Form', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + test('renders login title in PT-BR by default', () => { + renderLoginPage(); + const elements = screen.getAllByText(/SmartDataTest/i); + expect(elements.length).toBeGreaterThan(0); + }); + + test('email and password fields exist', () => { + renderLoginPage(); + expect(document.querySelector('input[type="email"]')).toBeTruthy(); + expect(document.querySelector('input[type="password"]')).toBeTruthy(); + }); + + test('password visibility toggle works', () => { + renderLoginPage(); + const passwordInput = document.querySelector('input[type="password"]'); + expect(passwordInput).toBeTruthy(); + // Eye icon should be present (password is hidden) + expect(screen.getByTestId('icon-eye')).toBeInTheDocument(); + // Click toggle + const eyeIcon = screen.getByTestId('icon-eye'); + fireEvent.click(eyeIcon.closest('button')); + // After click, EyeOff should appear + expect(screen.getByTestId('icon-eyeoff')).toBeInTheDocument(); + }); + + test('rememberMe checkbox is interactive', () => { + renderLoginPage(); + const checkbox = document.querySelector('input[type="checkbox"]'); + expect(checkbox).toBeTruthy(); + fireEvent.click(checkbox); + expect(checkbox.checked).toBe(true); + }); + + test('renders animated background nodes with data-testid', () => { + renderLoginPage(); + expect(document.querySelector('[data-testid="animated-bg"]')).toBeInTheDocument(); + }); + + test('footer with copyright renders in PT-BR', () => { + renderLoginPage(); + const elements = screen.getAllByText(/2026/i); + expect(elements.length).toBeGreaterThan(0); + }); + + test('demo credentials section is expandable', () => { + renderLoginPage(); + const detailsEl = document.querySelector('details'); + expect(detailsEl).toBeTruthy(); + }); +}); + +describe('LoginPage — Login Form 
submission', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + test('calls handleLogin on form submit', async () => { + mockHandleLogin.mockResolvedValue(false); + renderLoginPage(); + const emailInput = document.querySelector('input[type="email"]'); + const passwordInput = document.querySelector('input[type="password"]'); + if (emailInput && passwordInput) { + fireEvent.change(emailInput, { target: { value: 'test@example.com' } }); + fireEvent.change(passwordInput, { target: { value: 'password' } }); + const form = document.querySelector('form'); + if (form) fireEvent.submit(form); + await waitFor(() => { + expect(mockHandleLogin).toHaveBeenCalled(); + }); + } + }); +}); + +describe('LoginPage — Error display', () => { + test('footer with copyright present', () => { + renderLoginPage(); + const elements = screen.getAllByText(/2026/i); + expect(elements.length).toBeGreaterThan(0); + }); +}); + +describe('LoginPage — Right Panel', () => { + test('right panel does not contain live detection feed', () => { + renderLoginPage(); + // "Detecções" / "Detections" should not appear — feed was removed + expect(screen.queryByText(/Detecções/i)).not.toBeInTheDocument(); + expect(screen.queryByText(/Detections/i)).not.toBeInTheDocument(); + }); +}); diff --git a/tests/frontend/unit/MethodologyPage.test.js b/tests/frontend/unit/MethodologyPage.test.js index b9b584a..ab923a5 100644 --- a/tests/frontend/unit/MethodologyPage.test.js +++ b/tests/frontend/unit/MethodologyPage.test.js @@ -41,6 +41,11 @@ jest.mock('../../../frontend/src/styles/animations', () => ({ staggerContainer: {}, })); +// Mock LanguageContext — MethodologyPage now uses useLanguage() globally +jest.mock('../../../frontend/src/context/LanguageContext', () => ({ + useLanguage: () => ({ language: 'pt-BR', changeLanguage: jest.fn() }), +})); + describe('MethodologyPage', () => { const renderWithRouter = (component) => { return render( diff --git a/tests/frontend/unit/ProtectedRoute.test.js 
b/tests/frontend/unit/ProtectedRoute.test.js new file mode 100644 index 0000000..33d398d --- /dev/null +++ b/tests/frontend/unit/ProtectedRoute.test.js @@ -0,0 +1,96 @@ +/** + * Tests for frontend/src/components/ProtectedRoute.js + */ + +import React from 'react'; +import { render, screen } from '@testing-library/react'; +import { BrowserRouter } from 'react-router-dom'; +import '@testing-library/jest-dom'; + +const mockUseAuthContext = jest.fn(); +jest.mock('../../../frontend/src/context/AuthContext', () => ({ + useAuthContext: () => mockUseAuthContext(), +})); + +const mockUseLanguage = jest.fn(() => ({ language: 'pt-BR', changeLanguage: jest.fn() })); +jest.mock('../../../frontend/src/context/LanguageContext', () => ({ + useLanguage: () => mockUseLanguage(), +})); + +// Mock Navigate to inspect redirect calls without actual navigation +const mockNavigateFn = jest.fn(() => null); +jest.mock('react-router-dom', () => ({ + BrowserRouter: ({ children }) =>
{children}
, + Navigate: (props) => { + mockNavigateFn(props); + return
; + }, + useLocation: () => ({ pathname: '/dashboard', state: null }), + useNavigate: () => jest.fn(), + Route: ({ element }) => element, + Routes: ({ children }) =>
{children}
, +})); + +import ProtectedRoute from '../../../frontend/src/components/ProtectedRoute'; + +const ChildComponent = () =>
Protected Content
; + +const renderProtectedRoute = (authState) => { + mockUseAuthContext.mockReturnValue(authState); + return render( + + + + + + ); +}; + +describe('ProtectedRoute', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockUseLanguage.mockReturnValue({ language: 'pt-BR', changeLanguage: jest.fn() }); + }); + + test('redirects to /login when not authenticated', () => { + renderProtectedRoute({ isAuthenticated: false, hasProfile: false, isLoading: false }); + expect(screen.queryByTestId('protected-content')).not.toBeInTheDocument(); + expect(screen.getByTestId('navigate')).toBeInTheDocument(); + expect(screen.getByTestId('navigate').getAttribute('data-to')).toBe('/login'); + }); + + test('renders children when authenticated with profile', () => { + renderProtectedRoute({ isAuthenticated: true, hasProfile: true, isLoading: false }); + expect(screen.getByTestId('protected-content')).toBeInTheDocument(); + }); + + test('redirects to /login when authenticated but without profile', () => { + renderProtectedRoute({ isAuthenticated: true, hasProfile: false, isLoading: false }); + expect(screen.queryByTestId('protected-content')).not.toBeInTheDocument(); + expect(screen.getByTestId('navigate')).toBeInTheDocument(); + expect(screen.getByTestId('navigate').getAttribute('data-to')).toBe('/login'); + }); + + test('shows LoadingScreen during isLoading=true', () => { + renderProtectedRoute({ isAuthenticated: false, hasProfile: false, isLoading: true }); + expect(screen.queryByTestId('protected-content')).not.toBeInTheDocument(); + expect(screen.getByTestId('loading-screen')).toBeInTheDocument(); + }); + + test('preserves original route in location.state.from when redirecting', () => { + renderProtectedRoute({ isAuthenticated: false, hasProfile: false, isLoading: false }); + expect(mockNavigateFn).toHaveBeenCalledWith( + expect.objectContaining({ + to: '/login', + state: expect.objectContaining({ from: expect.any(Object) }), + }) + ); + }); + + test('LoadingScreen shows 
"Loading..." label when language is en-US', () => { + mockUseLanguage.mockReturnValue({ language: 'en-US', changeLanguage: jest.fn() }); + renderProtectedRoute({ isAuthenticated: false, hasProfile: false, isLoading: true }); + expect(screen.getByTestId('loading-screen')).toBeInTheDocument(); + expect(screen.getByText('Loading...')).toBeInTheDocument(); + }); +}); diff --git a/tests/frontend/unit/QaChecklist.test.js b/tests/frontend/unit/QaChecklist.test.js index a437450..8f2a387 100644 --- a/tests/frontend/unit/QaChecklist.test.js +++ b/tests/frontend/unit/QaChecklist.test.js @@ -169,15 +169,15 @@ describe('QaChecklist Component', () => { // Fill and submit fireEvent.change(textarea, { target: { value: 'start_date:<:end_date' } }); - const submitButton = screen.getByRole('button', { name: /Gerar DSL e PySpark/i }); + const submitButton = screen.getByRole('button', { name: /Gerar JSON e PySpark/i }); fireEvent.click(submitButton); // Should show success message and results await waitFor(() => { - expect(screen.getByText(/DSL e código PySpark gerados com sucesso/)).toBeInTheDocument(); + expect(screen.getByText(/JSON e código PySpark gerados com sucesso/)).toBeInTheDocument(); }); - expect(screen.getByText('DSL (Domain Specific Language)')).toBeInTheDocument(); + expect(screen.getByText('JSON')).toBeInTheDocument(); expect(screen.getByText('Código PySpark')).toBeInTheDocument(); }); @@ -240,12 +240,12 @@ describe('QaChecklist Component', () => { // Submit on last question fireEvent.change(textarea, { target: { value: 'start_date:<:end_date' } }); - const submitButton = screen.getByRole('button', { name: /Gerar DSL e PySpark/i }); + const submitButton = screen.getByRole('button', { name: /Gerar JSON e PySpark/i }); fireEvent.click(submitButton); // Should show error await waitFor(() => { - expect(screen.getByText(/Failed to generate DSL and PySpark code/)).toBeInTheDocument(); + expect(screen.getByText(/Failed to generate JSON and PySpark code/)).toBeInTheDocument(); 
}); }); diff --git a/tests/frontend/unit/SupportPage.test.js b/tests/frontend/unit/SupportPage.test.js index 00e1a56..5f52273 100644 --- a/tests/frontend/unit/SupportPage.test.js +++ b/tests/frontend/unit/SupportPage.test.js @@ -37,7 +37,7 @@ describe('SupportPage Integration Tests', () => { test('renders SupportPage with title and description', () => { render(); - expect(screen.getByText(/DataForgeTest Support/i)).toBeInTheDocument(); + expect(screen.getByText(/SmartDataTest Support/i)).toBeInTheDocument(); expect(screen.getByText(/Get help with your data quality testing setup using our AI-powered documentation assistant/i)).toBeInTheDocument(); }); @@ -77,6 +77,6 @@ describe('SupportPage Integration Tests', () => { expect(screen.getByTestId('message-circle-icon')).toBeInTheDocument(); // Check title is present - expect(screen.getByText(/DataForgeTest Support/i)).toBeInTheDocument(); + expect(screen.getByText(/SmartDataTest Support/i)).toBeInTheDocument(); }); }); diff --git a/tests/frontend/unit/TestDatasetGold.test.js b/tests/frontend/unit/TestDatasetGold.test.js index 781a53c..a64e891 100644 --- a/tests/frontend/unit/TestDatasetGold.test.js +++ b/tests/frontend/unit/TestDatasetGold.test.js @@ -80,7 +80,7 @@ describe('TestDatasetGold Component', () => { test('sets document title', () => { renderWithRouter(); - expect(document.title).toBe('Test Dataset GOLD - DataForgeTest'); + expect(document.title).toBe('Test Dataset GOLD - SmartDataTest'); }); test('has proper navigation structure', () => { diff --git a/tests/frontend/unit/authStorage.test.js b/tests/frontend/unit/authStorage.test.js new file mode 100644 index 0000000..5c391c7 --- /dev/null +++ b/tests/frontend/unit/authStorage.test.js @@ -0,0 +1,150 @@ +/** + * Tests for frontend/src/utils/authStorage.js + */ + +import { + saveSession, + getSession, + clearSession, + isAuthenticated, + saveProfile, + hasProfile, +} from '../../../frontend/src/utils/authStorage'; + +const SESSION_KEY = 'smartdatatest_session'; 
+ +const mockUser = { + id: 'user-1', + name: 'Test User', + email: 'test@example.com', + role: 'tester', + avatar: null, + passwordHash: 'should-not-be-stored', +}; + +beforeEach(() => { + localStorage.clear(); + jest.restoreAllMocks(); +}); + +describe('saveSession', () => { + test('stores session without including passwordHash', () => { + const setItemSpy = jest.spyOn(Storage.prototype, 'setItem'); + saveSession(mockUser, false); + expect(setItemSpy).toHaveBeenCalled(); + const stored = JSON.parse(localStorage.getItem(SESSION_KEY)); + expect(stored).not.toHaveProperty('passwordHash'); + expect(stored.email).toBe(mockUser.email); + }); + + test('with rememberMe=true sets expiry ~7 days from now', () => { + const now = Date.now(); + saveSession(mockUser, true); + const stored = JSON.parse(localStorage.getItem(SESSION_KEY)); + const expectedExpiry = now + 7 * 24 * 60 * 60 * 1000; + expect(stored.expiresAt).toBeGreaterThanOrEqual(expectedExpiry - 5000); + expect(stored.expiresAt).toBeLessThanOrEqual(expectedExpiry + 5000); + }); + + test('without rememberMe sets expiry ~8 hours from now', () => { + const now = Date.now(); + saveSession(mockUser, false); + const stored = JSON.parse(localStorage.getItem(SESSION_KEY)); + const expectedExpiry = now + 8 * 60 * 60 * 1000; + expect(stored.expiresAt).toBeGreaterThanOrEqual(expectedExpiry - 5000); + expect(stored.expiresAt).toBeLessThanOrEqual(expectedExpiry + 5000); + }); +}); + +describe('getSession', () => { + test('returns null when storage is empty', () => { + expect(getSession()).toBeNull(); + }); + + test('returns null and clears key when JSON is malformed', () => { + localStorage.setItem(SESSION_KEY, 'not-valid-json{{{'); + expect(getSession()).toBeNull(); + }); + + test('returns null when session is expired', () => { + const expired = { + userId: 'user-1', + name: 'Test User', + email: 'test@example.com', + role: 'tester', + avatar: null, + profile: null, + loginAt: Date.now() - 10000, + expiresAt: Date.now() - 
1000, // expired 1 second ago + }; + localStorage.setItem(SESSION_KEY, JSON.stringify(expired)); + expect(getSession()).toBeNull(); + }); + + test('returns session object when session is valid', () => { + const valid = { + userId: 'user-1', + name: 'Test User', + email: 'test@example.com', + role: 'tester', + avatar: null, + profile: null, + loginAt: Date.now(), + expiresAt: Date.now() + 8 * 60 * 60 * 1000, + }; + localStorage.setItem(SESSION_KEY, JSON.stringify(valid)); + const session = getSession(); + expect(session).not.toBeNull(); + expect(session.email).toBe('test@example.com'); + }); +}); + +describe('clearSession', () => { + test('removes the session key from localStorage', () => { + const removeItemSpy = jest.spyOn(Storage.prototype, 'removeItem'); + saveSession(mockUser, false); + clearSession(); + expect(removeItemSpy).toHaveBeenCalledWith(SESSION_KEY); + expect(localStorage.getItem(SESSION_KEY)).toBeNull(); + }); +}); + +describe('isAuthenticated', () => { + test('returns false when there is no session', () => { + expect(isAuthenticated()).toBe(false); + }); + + test('returns true when there is a valid session', () => { + saveSession(mockUser, false); + expect(isAuthenticated()).toBe(true); + }); +}); + +describe('saveProfile', () => { + test('updates session.profile in localStorage', () => { + saveSession(mockUser, false); + const profileData = { role: 'tester', setAt: new Date().toISOString() }; + saveProfile(profileData); + const stored = JSON.parse(localStorage.getItem(SESSION_KEY)); + expect(stored.profile).toEqual(profileData); + }); + + test('does nothing when there is no active session', () => { + // No session in localStorage + expect(() => saveProfile({ role: 'tester' })).not.toThrow(); + expect(localStorage.getItem(SESSION_KEY)).toBeNull(); + }); +}); + +describe('hasProfile', () => { + test('returns false when profile is null', () => { + saveSession(mockUser, false); + expect(hasProfile()).toBe(false); + }); + + test('returns true when 
profile is set', () => { + saveSession(mockUser, false); + saveProfile({ role: 'tester', setAt: new Date().toISOString() }); + expect(hasProfile()).toBe(true); + }); +}); diff --git a/tests/frontend/unit/commonTranslations.test.js b/tests/frontend/unit/commonTranslations.test.js new file mode 100644 index 0000000..41f71fa --- /dev/null +++ b/tests/frontend/unit/commonTranslations.test.js @@ -0,0 +1,84 @@ +/** + * Tests for frontend/src/utils/commonTranslations.js + */ + +import { commonTranslations } from '../../../frontend/src/utils/commonTranslations'; + +const EXPECTED_KEYS = [ + 'backToHome', + 'loading', + 'error', + 'success', + 'cancel', + 'confirm', + 'save', + 'download', + 'upload', + 'reset', +]; + +describe('commonTranslations', () => { + test('is exported as a non-null object', () => { + expect(commonTranslations).toBeDefined(); + expect(typeof commonTranslations).toBe('object'); + expect(commonTranslations).not.toBeNull(); + }); + + test('contains pt-BR locale', () => { + expect(commonTranslations).toHaveProperty('pt-BR'); + }); + + test('contains en-US locale', () => { + expect(commonTranslations).toHaveProperty('en-US'); + }); + + test.each(EXPECTED_KEYS)( + 'pt-BR has non-empty string for key "%s"', + (key) => { + expect(typeof commonTranslations['pt-BR'][key]).toBe('string'); + expect(commonTranslations['pt-BR'][key].length).toBeGreaterThan(0); + } + ); + + test.each(EXPECTED_KEYS)( + 'en-US has non-empty string for key "%s"', + (key) => { + expect(typeof commonTranslations['en-US'][key]).toBe('string'); + expect(commonTranslations['en-US'][key].length).toBeGreaterThan(0); + } + ); + + test('pt-BR and en-US have the same set of keys', () => { + const ptKeys = Object.keys(commonTranslations['pt-BR']).sort(); + const enKeys = Object.keys(commonTranslations['en-US']).sort(); + expect(ptKeys).toEqual(enKeys); + }); + + test('pt-BR backToHome is in Portuguese', () => { + expect(commonTranslations['pt-BR'].backToHome).toBe('Voltar para Home'); + }); + + 
test('en-US backToHome is in English', () => { + expect(commonTranslations['en-US'].backToHome).toBe('Back to Home'); + }); + + test('pt-BR loading text matches expected value', () => { + expect(commonTranslations['pt-BR'].loading).toBe('Carregando...'); + }); + + test('en-US loading text matches expected value', () => { + expect(commonTranslations['en-US'].loading).toBe('Loading...'); + }); + + test('translations can be used with fallback pattern', () => { + const lang = 'en-US'; + const tc = commonTranslations[lang] ?? commonTranslations['en-US']; + expect(tc.error).toBe('Error'); + }); + + test('unknown locale falls back to en-US via nullish coalescing', () => { + const lang = 'fr-FR'; + const tc = commonTranslations[lang] ?? commonTranslations['en-US']; + expect(tc).toEqual(commonTranslations['en-US']); + }); +}); diff --git a/tests/frontend/unit/useAuth.test.js b/tests/frontend/unit/useAuth.test.js new file mode 100644 index 0000000..6eac4ed --- /dev/null +++ b/tests/frontend/unit/useAuth.test.js @@ -0,0 +1,137 @@ +/** + * Tests for frontend/src/hooks/useAuth.js + */ + +import React from 'react'; +import { renderHook, act } from '@testing-library/react'; +import '@testing-library/jest-dom'; + +// Mock react-router-dom +const mockNavigate = jest.fn(); +jest.mock('react-router-dom', () => ({ + useNavigate: () => mockNavigate, +})); + +// Mock the users data +jest.mock('../../../frontend/src/data/users', () => ({ + REGISTERED_USERS: [ + { + id: 'user-1', + name: 'Admin User', + email: 'admin@smartdatatest.com', + password: 'admin123', + role: 'admin', + avatar: null, + }, + ], +})); + +// Mock authStorage utilities +const mockLogin = jest.fn(); +const mockLogout = jest.fn(); +const mockSaveUserProfile = jest.fn(); +jest.mock('../../../frontend/src/context/AuthContext', () => ({ + useAuthContext: () => ({ + login: mockLogin, + logout: mockLogout, + saveUserProfile: mockSaveUserProfile, + isAuthenticated: false, + hasProfile: false, + isLoading: false, + user: 
null, + }), +})); + +// Mock bcrypt/password check — we mock the module that verifies passwords +jest.mock('../../../frontend/src/utils/authStorage', () => ({ + saveSession: jest.fn(), + getSession: jest.fn(() => null), + clearSession: jest.fn(), + isAuthenticated: jest.fn(() => false), + saveProfile: jest.fn(), + hasProfile: jest.fn(() => false), +})); + +import useAuth from '../../../frontend/src/hooks/useAuth'; + +beforeEach(() => { + jest.clearAllMocks(); + jest.useFakeTimers(); +}); + +afterEach(() => { + jest.useRealTimers(); +}); + +describe('useAuth', () => { + test('login with correct credentials returns true', async () => { + const { result } = renderHook(() => useAuth()); + let loginResult; + await act(async () => { + const promise = result.current.handleLogin('admin@smartdatatest.com', 'admin123', false); + jest.advanceTimersByTime(1200); + loginResult = await promise; + }); + expect(loginResult).toBe(true); + }); + + test('login with wrong email returns false and sets bilingual error', async () => { + const { result } = renderHook(() => useAuth()); + let loginResult; + await act(async () => { + const promise = result.current.handleLogin('wrong@email.com', 'admin123', false); + jest.advanceTimersByTime(1200); + loginResult = await promise; + }); + expect(loginResult).toBe(false); + expect(result.current.error).not.toBeNull(); + expect(result.current.error['pt-BR']).toBeTruthy(); + expect(result.current.error['en-US']).toBeTruthy(); + }); + + test('login with wrong password returns false and sets bilingual error', async () => { + const { result } = renderHook(() => useAuth()); + let loginResult; + await act(async () => { + const promise = result.current.handleLogin('admin@smartdatatest.com', 'wrongpass', false); + jest.advanceTimersByTime(1200); + loginResult = await promise; + }); + expect(loginResult).toBe(false); + expect(result.current.error).not.toBeNull(); + expect(result.current.error['pt-BR']).toBeTruthy(); + 
expect(result.current.error['en-US']).toBeTruthy(); + }); + + test('clearError sets error to null', async () => { + const { result } = renderHook(() => useAuth()); + await act(async () => { + const promise = result.current.handleLogin('wrong@email.com', 'pass', false); + jest.advanceTimersByTime(1200); + await promise; + }); + expect(result.current.error).not.toBeNull(); + act(() => { + result.current.clearError(); + }); + expect(result.current.error).toBeNull(); + }); + + test('handleSaveProfile calls saveUserProfile and navigates to /', () => { + const { result } = renderHook(() => useAuth()); + act(() => { + result.current.handleSaveProfile({ role: 'tester', setAt: new Date().toISOString() }); + }); + expect(mockSaveUserProfile).toHaveBeenCalled(); + expect(mockNavigate).toHaveBeenCalledWith('/'); + }); + + test('handleLogout calls logout and navigates to /login', () => { + const { result } = renderHook(() => useAuth()); + act(() => { + result.current.handleLogout(); + }); + expect(mockLogout).toHaveBeenCalled(); + expect(mockNavigate).toHaveBeenCalledWith('/login'); + }); +}); diff --git a/tests/frontend/unit/useStats.test.js b/tests/frontend/unit/useStats.test.js new file mode 100644 index 0000000..683ae07 --- /dev/null +++ b/tests/frontend/unit/useStats.test.js @@ -0,0 +1,115 @@ +/** + * Tests for frontend/src/hooks/useStats.js + */ + +import { renderHook, waitFor } from '@testing-library/react'; +import '@testing-library/jest-dom'; + +// Mock getApiUrl +jest.mock('../../../frontend/src/config/api', () => ({ + getApiUrl: (path) => `http://localhost:5000${path}`, +})); + +const MOCK_RESPONSE = { + tests_total: 971, + datasets_total: 1180, + coverage_pct: 86, + response_sla_ms: 2000, +}; + +beforeEach(() => { + jest.spyOn(global, 'fetch').mockResolvedValue({ + ok: true, + json: async () => MOCK_RESPONSE, + }); +}); + +afterEach(() => { + jest.restoreAllMocks(); +}); + +import useStats from '../../../frontend/src/hooks/useStats'; + +describe('useStats', () => { 
+ test('returns fallback values on initial render', () => { + jest.spyOn(global, 'fetch').mockImplementation(() => new Promise(() => {})); // hanging + const { result } = renderHook(() => useStats()); + // Before fetch resolves, fallback values are returned + expect(result.current.tests).toBe('970+'); + expect(result.current.datasets).toBe('1180+'); + expect(result.current.coverage).toBe('86%'); + expect(result.current.responseSla).toBe('<2s'); + }); + + test('updates stats after successful API response', async () => { + const { result } = renderHook(() => useStats()); + await waitFor(() => { + expect(result.current.tests).toBe('971+'); + }); + expect(result.current.datasets).toBe('1180+'); + expect(result.current.coverage).toBe('86%'); + expect(result.current.responseSla).toBe('<2s'); + }); + + test('formats response_sla_ms >= 1000 as seconds', async () => { + jest.spyOn(global, 'fetch').mockResolvedValue({ + ok: true, + json: async () => ({ ...MOCK_RESPONSE, response_sla_ms: 2000 }), + }); + const { result } = renderHook(() => useStats()); + await waitFor(() => expect(result.current.tests).toBe('971+')); + expect(result.current.responseSla).toBe('<2s'); + }); + + test('formats response_sla_ms < 1000 as milliseconds', async () => { + jest.spyOn(global, 'fetch').mockResolvedValue({ + ok: true, + json: async () => ({ ...MOCK_RESPONSE, response_sla_ms: 500 }), + }); + const { result } = renderHook(() => useStats()); + await waitFor(() => expect(result.current.tests).toBe('971+')); + expect(result.current.responseSla).toBe('<500ms'); + }); + + test('keeps fallback values when fetch throws a network error', async () => { + jest.spyOn(global, 'fetch').mockRejectedValue(new Error('Network error')); + const { result } = renderHook(() => useStats()); + // Wait a tick + await new Promise((r) => setTimeout(r, 50)); + expect(result.current.tests).toBe('970+'); + expect(result.current.datasets).toBe('1180+'); + }); + + test('keeps fallback values when API returns non-ok 
status', async () => { + jest.spyOn(global, 'fetch').mockResolvedValue({ ok: false, json: async () => ({}) }); + const { result } = renderHook(() => useStats()); + await new Promise((r) => setTimeout(r, 50)); + expect(result.current.tests).toBe('970+'); + }); + + test('calls correct API endpoint', async () => { + const fetchSpy = jest.spyOn(global, 'fetch'); + const { result } = renderHook(() => useStats()); + await waitFor(() => expect(result.current.tests).toBe('971+')); + expect(fetchSpy).toHaveBeenCalledWith( + 'http://localhost:5000/api/stats', + expect.objectContaining({ method: 'GET' }) + ); + }); + + test('falls back gracefully when AbortSignal.timeout is unavailable', async () => { + const originalTimeout = AbortSignal.timeout; + // Simulate environments where AbortSignal.timeout does not exist + delete AbortSignal.timeout; + try { + jest.spyOn(global, 'fetch').mockResolvedValue({ + ok: true, + json: async () => MOCK_RESPONSE, + }); + const { result } = renderHook(() => useStats()); + await waitFor(() => expect(result.current.tests).toBe('971+')); + } finally { + AbortSignal.timeout = originalTimeout; + } + }); +}); diff --git a/tests/frontend/unit/users.test.js b/tests/frontend/unit/users.test.js new file mode 100644 index 0000000..2f15697 --- /dev/null +++ b/tests/frontend/unit/users.test.js @@ -0,0 +1,71 @@ +/** + * Tests for frontend/src/data/users.js + * Validates the registered users data structure used for frontend-only demo auth. 
+ */ + +import { REGISTERED_USERS } from '../../../frontend/src/data/users'; + +describe('REGISTERED_USERS', () => { + test('is a non-empty array', () => { + expect(Array.isArray(REGISTERED_USERS)).toBe(true); + expect(REGISTERED_USERS.length).toBeGreaterThanOrEqual(3); + }); + + test('each user has the required fields', () => { + REGISTERED_USERS.forEach((user) => { + expect(user).toHaveProperty('id'); + expect(user).toHaveProperty('name'); + expect(user).toHaveProperty('email'); + expect(user).toHaveProperty('password'); + expect(user).toHaveProperty('role'); + expect(user).toHaveProperty('avatar'); + expect(user).toHaveProperty('createdAt'); + }); + }); + + test('all user ids are unique', () => { + const ids = REGISTERED_USERS.map((u) => u.id); + const unique = new Set(ids); + expect(unique.size).toBe(ids.length); + }); + + test('all user emails are unique', () => { + const emails = REGISTERED_USERS.map((u) => u.email); + const unique = new Set(emails); + expect(unique.size).toBe(emails.length); + }); + + test('admin user exists with correct role', () => { + const admin = REGISTERED_USERS.find((u) => u.email === 'admin@smartdatatest.com'); + expect(admin).toBeDefined(); + expect(admin.role).toBe('admin'); + expect(admin.password).toBe('admin123'); + }); + + test('engineer user exists with correct role', () => { + const eng = REGISTERED_USERS.find((u) => u.email === 'engineer@smartdatatest.com'); + expect(eng).toBeDefined(); + expect(eng.role).toBe('data_eng'); + expect(eng.password).toBe('engineer123'); + }); + + test('qa user exists with correct role', () => { + const qa = REGISTERED_USERS.find((u) => u.email === 'qa@smartdatatest.com'); + expect(qa).toBeDefined(); + expect(qa.role).toBe('tester'); + expect(qa.password).toBe('qa123456'); + }); + + test('avatar field is null for all demo users', () => { + REGISTERED_USERS.forEach((user) => { + expect(user.avatar).toBeNull(); + }); + }); + + test('createdAt is a valid ISO date string', () => { + 
REGISTERED_USERS.forEach((user) => { + const date = new Date(user.createdAt); + expect(date.toString()).not.toBe('Invalid Date'); + }); + }); +});