diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 19da5283..e889853a 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -9,23 +9,23 @@ assignees: '' **Describe the bug** - + **Expected behavior** - + **How to reproduce** - + **Screenshots** - + **Software version** - + **Desktop (please complete the following information if relevant):** - - OS: - - Browser: - - Version: - +- OS: +- Browser: +- Version: + **Additional context** - + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 16962298..19d7f20b 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -5,7 +5,7 @@ contact_links: about: Problems with Teragrep documentation - name: Ask a question or get support url: https://github.com/teragrep/cfe_39/discussions - about: Ask a question or request support + about: Ask a question or request support - name: Report vulnerability url: https://github.com/teragrep/teragrep/security/advisories/new - about: Privately report a security vulnerability + about: Privately report a security vulnerability \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_requests.md b/.github/ISSUE_TEMPLATE/feature_requests.md new file mode 100644 index 00000000..501c73ec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_requests.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Description** + + +**Use case or motivation behind the feature request** + + +**Related issues** + + +**Additional context** + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/tasks-and-meta.md b/.github/ISSUE_TEMPLATE/tasks-and-meta.md index 17546e49..8346c565 100644 --- a/.github/ISSUE_TEMPLATE/tasks-and-meta.md +++ b/.github/ISSUE_TEMPLATE/tasks-and-meta.md @@ -8,4 +8,4 @@ assignees: '' --- 
**Description** - + \ No newline at end of file diff --git a/.github/workflows/upload_release.yaml b/.github/workflows/upload_release.yaml new file mode 100644 index 00000000..4138e06f --- /dev/null +++ b/.github/workflows/upload_release.yaml @@ -0,0 +1,43 @@ +name: Upload Release + +on: + release: + types: [published] + +jobs: + upload: + name: Upload + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + server-id: github + settings-path: ${{ github.workspace }} + + + - name: Package jar + run: mvn --batch-mode -Drevision=${{ github.event.release.tag_name }} -Dsha1= -Dchangelist= clean package + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Package rpm + run: cd rpm/ && mvn --batch-mode -Drevision=${{ github.event.release.tag_name }} -Dsha1= -Dchangelist= -f rpm.pom.xml package + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Attach rpm to release + uses: softprops/action-gh-release@v1 + with: + files: | + rpm/target/rpm/com.teragrep-cfe_39/RPMS/noarch/com.teragrep-cfe_39-*.noarch.rpm + target/cfe_39-*-jar-with-dependencies.jar \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2f435308..cc5ebc0c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ buildNumber.properties .project # JDT-specific (Eclipse Java Development Tools) .classpath + +src/main/java/com/teragrep/cfe_39/avro/SyslogRecord.java \ No newline at end of file diff --git a/README.adoc b/README.adoc new file mode 100644 index 00000000..4bd6c77a --- /dev/null +++ b/README.adoc @@ -0,0 +1,41 @@ + +# CFE_39 + +This is a HDFS Data Ingestion module for PTH_06 use. + +## Features + +Implements almost real-time datasource that allows reading latest data from Kafka (last few records), semi-latest data from HDFS (Last Few Days) and old data from S3 Archive. 
+ +## Documentation + +See the official documentation on https://docs.teragrep.com[docs.teragrep.com]. + +## How to compile and use + +`mvn clean package` + +application.properties, config.jaas and log4j2.properties files have to be created to use this module. +By default, the application.properties file must be placed in the /opt/teragrep/cfe_39/etc/ directory. +The application.properties is used to define the directory where the other files must be placed. + +Example configuration files are available in the cfe_39/rpm/resources/ directory. + +## Contributing + +You can involve yourself with our project by https://github.com/teragrep/cfe_39/issues/new/choose[opening an issue] or submitting a pull request. + +Contribution requirements: + +. *All changes must be accompanied by a new or changed test.* If you think testing is not required in your pull request, include a sufficient explanation as to why you think so. +. Security checks must pass +. Pull requests must align with the principles and http://www.extremeprogramming.org/values.html[values] of extreme programming. +. Pull requests must follow the principles of Object Thinking and Elegant Objects (EO). + +Read more in our https://github.com/teragrep/teragrep/blob/main/contributing.adoc[Contributing Guideline]. + +### Contributor License Agreement + +Contributors must sign https://github.com/teragrep/teragrep/blob/main/cla.adoc[Teragrep Contributor License Agreement] before a pull request is accepted to the organization's repositories. + +You need to submit the CLA only once. After submitting the CLA you can contribute to all Teragrep's repositories. 
\ No newline at end of file diff --git a/eclipse-java-formatter.xml b/eclipse-java-formatter.xml new file mode 100644 index 00000000..1e4e9905 --- /dev/null +++ b/eclipse-java-formatter.xml @@ -0,0 +1,450 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/license-header b/license-header new file mode 100644 index 00000000..d14a1f51 --- /dev/null +++ b/license-header @@ -0,0 +1,45 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 00000000..bda84206 --- /dev/null +++ b/pom.xml @@ -0,0 +1,385 @@ + + + 4.0.0 + com.teragrep + cfe_39 + ${revision}${sha1}${changelist} + jar + cfe_39 + + -SNAPSHOT + 3.3.6 + 1.8 + 1.8 + 1.8 + 4.2.8 + UTF-8 + 0.16.0 + 0.0.1 + + + + + io.dropwizard.metrics + metrics-core + ${metrics.version} + + + io.dropwizard.metrics + metrics-jmx + ${metrics.version} + + + io.prometheus + simpleclient + ${prometheus-simpleclient.version} + + + io.prometheus + simpleclient_dropwizard + ${prometheus-simpleclient.version} + + + io.prometheus + simpleclient_servlet + ${prometheus-simpleclient.version} + + + io.prometheus + simpleclient_hotspot + ${prometheus-simpleclient.version} + + + org.junit.jupiter + junit-jupiter-engine + 5.7.1 + test + + + org.junit.platform + junit-platform-launcher + 1.7.1 + test + + + org.junit.jupiter + junit-jupiter-api + 5.7.1 + test + + + org.junit.jupiter + junit-jupiter + 5.7.1 + test + + + com.teragrep + rlo_06 + 9.0.1 + + + + org.apache.kafka + kafka-clients + 3.4.0 + + + + org.xerial.snappy + snappy-java + 1.1.10.5 + + + org.apache.avro + avro + 1.11.3 + + + + org.apache.hadoop + hadoop-client + ${hadoop.version} + + + + org.apache.hadoop + hadoop-minicluster + ${hadoop.version} + test + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version} + test + + + + org.mockito + mockito-core + 4.11.0 + test + + + + org.apache.logging.log4j + log4j-slf4j2-impl + 2.20.0 + + + org.apache.logging.log4j + log4j-core + 2.20.0 + + + org.slf4j + slf4j-api + 2.0.7 + + + + ${project.artifactId}-${revision}${changelist}${sha1} + + + org.apache.rat + apache-rat-plugin + 0.16.1 + false + + false + false + + + Teragrep + Affero General Public License v3 + + + + + + + Suomen Kanuuna Oy + 2024 + + HDFS Data Ingestion for PTH_06 use CFE-39 + + Teragrep + + + true + false + + + .git/** + .gitattributes + .gitignore + 
.gitmodules + + .github/workflows/* + .github/ISSUE_TEMPLATE/* + toolchains.xml + settings.xml + + pom.xml + eclipse-java-formatter.xml + + README.adoc + + license-header + src/main/avro/KafkaRecord.avsc + src/main/assembly/jar-with-dependencies.xml + src/test/resources/broken.application.properties + src/test/resources/valid.application.properties + src/test/resources/failProcessing.application.properties + rpm/resources/config.jaas + rpm/resources/log4j2.properties + rpm/resources/application.properties + rpm/resources/cfe_39.service + rpm/rpm.pom.xml + src/main/java/com/teragrep/cfe_39/avro/SyslogRecord.java + + + + + + check + + test + + + + + org.apache.avro + avro-maven-plugin + 1.11.3 + + + + schema + + generate-sources + + ${project.basedir}/src/main/avro/ + ${project.basedir}/src/main/java/ + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + -Xlint:all + ${java.version} + ${java.version} + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + + true + true + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.6.0 + + + src/main/assembly/jar-with-dependencies.xml + + + + com.teragrep.cfe_39.Main + true + + + + + + make-assembly + + single + + package + + + + + org.apache.maven.plugins + maven-enforcer-plugin + 3.4.1 + + + enforce-maven + + enforce + + + + + 3.2.5 + + + + + + enforce + none + + + + + com.diffplug.spotless + spotless-maven-plugin + 2.30.0 + + + + ${project.basedir}/eclipse-java-formatter.xml + 4.10.0 + + + + ${project.basedir}/license-header + + + src/main/java/com/teragrep/cfe_39/avro/SyslogRecord.java + + + + + + UTF-8 + \n + true + false + 2 + recommended_2008_06 + true + true + true + + + + + + .gitattributes + .gitignore + + + + + true + 4 + + + + + + + + check + + compile + + + + + org.jacoco + jacoco-maven-plugin + 0.8.12 + + + + prepare-agent + + + + report + + report + + prepare-package + + + + + + diff --git a/rpm/resources/application.properties b/rpm/resources/application.properties new 
file mode 100644 index 00000000..fc5100cb --- /dev/null +++ b/rpm/resources/application.properties @@ -0,0 +1,52 @@ +# Kafka security configuration file +java.security.auth.login.config=/opt/teragrep/cfe_39/etc/config.jaas +# Logger settings +log4j2.configurationFile=/opt/teragrep/cfe_39/etc/log4j2.properties +# What topics are searched from kafka, regex +queueTopicPattern=^testConsumerTopic-*$ +# Number of consumers created to the consumer groups +numOfConsumers=2 +# Kafka bootstrap servers +consumer.bootstrap.servers=test +# Offset, should not be touched +consumer.auto.offset.reset=earliest +# Autocommit, should not be touched +consumer.enable.auto.commit=false +# Consumer group id, this is to track the progress of reading the topic +consumer.group.id=cfe_39 +# Used security protocol and mechanism +consumer.security.protocol=SASL_PLAINTEXT +consumer.sasl.mechanism=PLAIN +# Maximum records per batch, note that too big number will cause massive load and can cause timeouts to trigger +consumer.max.poll.records=500 +# How much data can be fetched in one go +consumer.fetch.max.bytes=1073741820 +# How long for request before timing out. Note that too big max poll records size can cause this to trigger +consumer.request.timeout.ms=300000 +consumer.max.poll.interval.ms=300000 +# For testing only, remove for prod. +consumer.useMockKafkaConsumer=true +# Directory where AVRO files are constructed for HDFS +queueDirectory=/opt/teragrep/cfe_39/etc/AVRO/ +# The maximum file size for AVRO-files that are to be stored in HDFS database. +maximumFileSize=60800000 +# Boolean for deciding if records not in RFC5424 should be skipped or not. +skipNonRFC5424Records=true +# Boolean for deciding if empty RFC5424 records should be skipped or not. +skipEmptyRFC5424Records=true +# HDFS pruning offset, prunes files older than the given milliseconds. 
+pruneOffset=172800000 +# HDFS uri +hdfsuri=hdfs://localhost:45937/ +# Kerberos +java.security.krb5.kdc=test +java.security.krb5.realm=test +hadoop.security.authentication=test +hadoop.security.authorization=test +dfs.namenode.kerberos.principal.pattern=test +KerberosKeytabUser=test +KerberosKeytabPath=test +dfs.client.use.datanode.hostname=false +kerberosLoginAutorenewal=true +dfs.data.transfer.protection=test +dfs.encrypt.data.transfer.cipher.suites=test \ No newline at end of file diff --git a/rpm/resources/cfe_39.service b/rpm/resources/cfe_39.service new file mode 100644 index 00000000..6bb58d56 --- /dev/null +++ b/rpm/resources/cfe_39.service @@ -0,0 +1,12 @@ +[Unit] +Description=com.teragrep.cfe_39 +ConditionPathExists=/opt/teragrep/cfe_39/lib/cfe_39.jar + +[Service] +ExecStart=/usr/lib/jvm/jre-1.8.0-openjdk/bin/java -jar /opt/teragrep/cfe_39/lib/cfe_39.jar +User=srv-cfe_39 +WorkingDirectory=/opt/teragrep/cfe_39 + +[Install] +WantedBy=multi-user.target + diff --git a/rpm/resources/config.jaas b/rpm/resources/config.jaas new file mode 100644 index 00000000..045b8540 --- /dev/null +++ b/rpm/resources/config.jaas @@ -0,0 +1,9 @@ +KafkaServer { + org.apache.kafka.common.security.plain.PlainLoginModule required + username="admin" + password="admin" + user_admin="admin" + user_alice="alice" + user_bob="bob" + user_charlie="charlie"; +}; \ No newline at end of file diff --git a/rpm/resources/log4j2.properties b/rpm/resources/log4j2.properties new file mode 100644 index 00000000..9ec3d8ec --- /dev/null +++ b/rpm/resources/log4j2.properties @@ -0,0 +1,10 @@ +appender.console.type = Console +appender.console.name = ConsoleLogger +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n +logging.level.org.apache.kafka=WARN +logging.level.io.confluent.kafka=WARN +rootLogger.level = INFO +rootLogger.appenderRef.stdout.ref = ConsoleLogger +logger.kafka.name = org.apache.kafka +logger.kafka.level = 
warn \ No newline at end of file diff --git a/rpm/rpm.pom.xml b/rpm/rpm.pom.xml new file mode 100644 index 00000000..fa4fb42a --- /dev/null +++ b/rpm/rpm.pom.xml @@ -0,0 +1,162 @@ + + + rpm + 4.0.0 + cfe_39 + ${revision}${sha1}${changelist} + cfe_39 + cfe_39 + com.teragrep + + UTF-8 + 1.8 + 1.8 + 1.8 + 0.0.1 + -SNAPSHOT + + + + ${project.basedir}/target + + + maven-enforcer-plugin + 3.4.1 + + + enforce + none + + + enforce-maven + + enforce + + + + + 3.2.5 + + + + + + + + org.codehaus.mojo + rpm-maven-plugin + 2.2.0 + true + + + default-rpm + + rpm + + package + + + + ${project.groupId}-${project.artifactId} + ${project.groupId}-${project.artifactId} + ${project.version} + ${env.BUILD_ID} + Proprietary + teragrep Log Management Suite + https://teragrep.com/ + teragrep <servicedesk@teragrep.com> + teragrep/LogManagementSuite + false + srv-cfe_39 + srv-cfe_39 + 0644 + 0755 + + _build_id_links none + __provides_exclude ^osgi\\(.*$ + __requires_exclude ^osgi\\(.*$ + + + + /opt/teragrep/${project.artifactId}/lib + true + 755 + 755 + srv-cfe_39 + srv-cfe_39 + true + + + ${project.basedir}/../target/cfe_39-${revision}${sha1}${changelist}-jar-with-dependencies.jar + cfe_39.jar + + + + + /opt/teragrep/${project.artifactId}/etc + true + noreplace + + + ${project.basedir}/../rpm/resources/application.properties + + + ${project.basedir}/../rpm/resources/config.jaas + + + ${project.basedir}/../rpm/resources/log4j2.properties + + + + + /usr/lib/systemd/system + false + + + ${project.basedir}/resources/cfe_39.service + + + + + + java-1.8.0-openjdk + + + + + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.1.1 + + true + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + default-compile + compile + + compile + + + true + + + + + + + diff --git a/src/main/assembly/jar-with-dependencies.xml b/src/main/assembly/jar-with-dependencies.xml new file mode 100644 index 00000000..ff9c3a29 --- /dev/null +++ b/src/main/assembly/jar-with-dependencies.xml @@ -0,0 
+1,21 @@ + + jar-with-dependencies + + jar + + false + + + metaInf-services + + + + + true + true + runtime + + + \ No newline at end of file diff --git a/src/main/avro/KafkaRecord.avsc b/src/main/avro/KafkaRecord.avsc new file mode 100644 index 00000000..2b55cad6 --- /dev/null +++ b/src/main/avro/KafkaRecord.avsc @@ -0,0 +1,15 @@ +{"namespace": "com.teragrep.cfe_39.avro", + "type": "record", + "name": "SyslogRecord", + "fields": [ + {"name": "timestamp", "type": "long"}, + {"name": "directory", "type": "string"}, + {"name": "stream", "type": "string"}, + {"name": "host", "type": "string"}, + {"name": "input", "type": "string"}, + {"name": "partition", "type": "string"}, + {"name": "offset", "type": "long"}, + {"name": "origin", "type": "string"}, + {"name": "payload", "type": "string"} + ] +} \ No newline at end of file diff --git a/src/main/java/com/teragrep/cfe_39/Config.java b/src/main/java/com/teragrep/cfe_39/Config.java new file mode 100644 index 00000000..c29e1ed9 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/Config.java @@ -0,0 +1,296 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39; + +import org.apache.logging.log4j.core.config.Configurator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Enumeration; +import java.util.Properties; + +public class Config { + + private final String queueTopicPattern; + private final Properties kafkaConsumerProperties; + private static final Logger LOGGER = LoggerFactory.getLogger(Config.class); + private final String hdfsPath; + private String hdfsuri; + private final String queueDirectory; + private final String kerberosHost; + private final String kerberosRealm; + private final String kerberosPrincipal; + private final String hadoopAuthentication; + private final String hadoopAuthorization; + private final String kerberosKeytabUser; + private final String kerberosKeytabPath; + private final String kerberosLoginAutorenewal; + private final String kerberosTestMode; + private long maximumFileSize; + private final int numOfConsumers; + private final long pruneOffset; + private final boolean skipNonRFC5424Records; + private final boolean skipEmptyRFC5424Records; + private final String dfsDataTransferProtection; + private final String dfsEncryptDataTransferCipherSuites; + + public Config() throws IOException { + Properties properties = new Properties(); + Path configPath = Paths + .get(System.getProperty("cfe_39.config.location", "/opt/teragrep/cfe_39/etc/application.properties")); + LOGGER.info("Loading application config <[{}]>", configPath.toAbsolutePath()); + + try (InputStream inputStream = Files.newInputStream(configPath)) { + properties.load(inputStream); + LOGGER.debug("Got configuration: <{}>", properties); + } + + // HDFS + this.hdfsPath = properties.getProperty("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/"); + this.hdfsuri = properties.getProperty("hdfsuri"); + if 
(this.hdfsuri == null) { + throw new IllegalArgumentException("hdfsuri not set"); + } + + // HDFS pruning + this.pruneOffset = Long.parseLong(properties.getProperty("pruneOffset", "172800000")); + if (this.pruneOffset <= 0) { + throw new IllegalArgumentException("pruneOffset must be set to >0, got " + pruneOffset); + } + + // AVRO + this.queueDirectory = properties.getProperty("queueDirectory", System.getProperty("user.dir") + "/etc/AVRO/"); + this.maximumFileSize = Long.parseLong(properties.getProperty("maximumFileSize", "60800000")); + if (this.maximumFileSize <= 0) { + throw new IllegalArgumentException("maximumFileSize must be set to >0, got " + maximumFileSize); + } + + // kerberos + this.kerberosHost = properties.getProperty("java.security.krb5.kdc"); + if (this.kerberosHost == null) { + throw new IllegalArgumentException("kerberosHost not set"); + } + this.kerberosRealm = properties.getProperty("java.security.krb5.realm"); + if (this.kerberosRealm == null) { + throw new IllegalArgumentException("kerberosRealm not set"); + } + this.hadoopAuthentication = properties.getProperty("hadoop.security.authentication"); + if (this.hadoopAuthentication == null) { + throw new IllegalArgumentException("hadoopAuthentication not set"); + } + this.hadoopAuthorization = properties.getProperty("hadoop.security.authorization"); + if (this.hadoopAuthorization == null) { + throw new IllegalArgumentException("hadoopAuthorization not set"); + } + this.kerberosPrincipal = properties.getProperty("dfs.namenode.kerberos.principal.pattern"); + if (this.kerberosPrincipal == null) { + throw new IllegalArgumentException("kerberosPrincipal not set"); + } + this.kerberosKeytabUser = properties.getProperty("KerberosKeytabUser"); + if (this.kerberosKeytabUser == null) { + throw new IllegalArgumentException("kerberosKeytabUser not set"); + } + this.kerberosKeytabPath = properties.getProperty("KerberosKeytabPath"); + if (this.kerberosKeytabPath == null) { + throw new 
IllegalArgumentException("kerberosKeytabPath not set"); + } + this.kerberosLoginAutorenewal = properties.getProperty("kerberosLoginAutorenewal"); + if (this.kerberosLoginAutorenewal == null) { + throw new IllegalArgumentException("kerberosLoginAutorenewal not set"); + } + this.kerberosTestMode = properties.getProperty("dfs.client.use.datanode.hostname", "false"); + + this.dfsDataTransferProtection = properties.getProperty("dfs.data.transfer.protection"); + if (this.dfsDataTransferProtection == null) { + throw new IllegalArgumentException("dfsDataTransferProtection not set"); + } + this.dfsEncryptDataTransferCipherSuites = properties.getProperty("dfs.encrypt.data.transfer.cipher.suites"); + if (this.dfsEncryptDataTransferCipherSuites == null) { + throw new IllegalArgumentException("dfsEncryptDataTransferCipherSuites not set"); + } + + // kafka + this.queueTopicPattern = properties.getProperty("queueTopicPattern", "^.*$"); + this.numOfConsumers = Integer.parseInt(properties.getProperty("numOfConsumers", "1")); + + // skip non RFC5424 records + this.skipNonRFC5424Records = properties.getProperty("skipNonRFC5424Records", "false").equalsIgnoreCase("true"); + + // skip empty RFC5424 records + this.skipEmptyRFC5424Records = properties + .getProperty("skipEmptyRFC5424Records", "false") + .equalsIgnoreCase("true"); + + this.kafkaConsumerProperties = loadSubProperties(properties, "consumer."); + String loginConfig = properties + .getProperty("java.security.auth.login.config", System.getProperty("user.dir") + "/rpm/resources/config.jaas"); + if (loginConfig == null) { + throw new IOException("Property java.security.auth.login.config does not exist"); + } + if (!(new File(loginConfig)).isFile()) { + throw new IOException("File '" + loginConfig + "' set by java.security.auth.login.config does not exist"); + } + + // Just for loggers to work + Path log4j2Config = Paths + .get(properties.getProperty("log4j2.configurationFile", System.getProperty("user.dir") + 
"/rpm/resources/log4j2.properties")); + LOGGER.info("Loading log4j2 config from <[{}]>", log4j2Config.toRealPath()); + Configurator.reconfigure(log4j2Config.toUri()); + } + + private Properties loadSubProperties(Properties properties, String prefix) { + Properties subProperties = new Properties(); + + Enumeration keys = properties.keys(); + while (keys.hasMoreElements()) { + String key = String.valueOf(keys.nextElement()); + if (key.startsWith(prefix)) { + String value = properties.getProperty(key); + String subKey = key.replaceFirst(prefix, ""); + subProperties.put(subKey, value); + } + } + return subProperties; + } + + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsuri(String input) { + this.hdfsuri = input; + } + + public String getHdfsuri() { + return hdfsuri; + } + + public String getQueueDirectory() { + return queueDirectory; + } + + public String getQueueTopicPattern() { + return queueTopicPattern; + } + + public Properties getKafkaConsumerProperties() { + return kafkaConsumerProperties; + } + + public String getKerberosHost() { + return kerberosHost; + } + + public String getKerberosRealm() { + return kerberosRealm; + } + + public String getKerberosPrincipal() { + return kerberosPrincipal; + } + + public String getHadoopAuthentication() { + return hadoopAuthentication; + } + + public String getHadoopAuthorization() { + return hadoopAuthorization; + } + + public String getKerberosKeytabUser() { + return kerberosKeytabUser; + } + + public String getKerberosKeytabPath() { + return kerberosKeytabPath; + } + + public String getKerberosTestMode() { + return kerberosTestMode; + } + + public long getMaximumFileSize() { + return maximumFileSize; + } + + public void setMaximumFileSize(long maximumFileSize) { + this.maximumFileSize = maximumFileSize; + } + + public int getNumOfConsumers() { + return numOfConsumers; + } + + public long getPruneOffset() { + return pruneOffset; + } + + public boolean getSkipNonRFC5424Records() { + return 
skipNonRFC5424Records; + } + + public boolean getSkipEmptyRFC5424Records() { + return skipEmptyRFC5424Records; + } + + public String getKerberosLoginAutorenewal() { + return kerberosLoginAutorenewal; + } + + public String getDfsDataTransferProtection() { + return dfsDataTransferProtection; + } + + public String getDfsEncryptDataTransferCipherSuites() { + return dfsEncryptDataTransferCipherSuites; + } +} diff --git a/src/main/java/com/teragrep/cfe_39/Main.java b/src/main/java/com/teragrep/cfe_39/Main.java new file mode 100644 index 00000000..bb4e633c --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/Main.java @@ -0,0 +1,75 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. 
Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +public class Main { + + private static final Logger LOGGER = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) throws Exception { + Config config = null; + try { + config = new Config(); + } + catch (IOException e) { + LOGGER.error("Can't load config: ", e); + System.exit(1); + } + catch (IllegalArgumentException e) { + LOGGER.error("Got invalid config: ", e); + System.exit(1); + } + LOGGER.info("Running main program"); + HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config); + hdfsDataIngestion.run(); + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java new file mode 100644 index 00000000..ae519335 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java @@ -0,0 +1,491 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, 
or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.google.gson.*;
+import com.teragrep.cfe_39.Config;
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.consumers.kafka.queue.WritableQueue;
+import com.teragrep.cfe_39.metrics.topic.TopicCounter;
+import com.teragrep.cfe_39.metrics.DurationStatistics;
+import com.teragrep.rlo_06.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.time.ZonedDateTime;
+import java.util.List;
+import java.util.function.Consumer;
+
+import java.nio.ByteBuffer;
+
+/* The kafka stream should first be deserialized using rlo_06 and then serialized again using avro and stored in HDFS.
+ The target where the record is stored in HDFS is based on the topic, partition and offset. ie. topic_name/0.123456 where offset is 123456
+ The mock consumer is activated for testing using the configuration file: readerKafkaProperties.getProperty("useMockKafkaConsumer", "false")*/
+
+/**
+ * Consumes batches of Kafka records, parses each record as an RFC5424 frame, serializes it into a
+ * local AVRO file and commits that file to HDFS when the file would exceed the configured maximum
+ * size, when the topic/partition of the incoming record changes, or when the batch ends.
+ */
+public class DatabaseOutput implements Consumer<List<RecordOffset>> {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(DatabaseOutput.class);
+    private final RFC5424Frame rfc5424Frame = new RFC5424Frame(false);
+
+    private final String table;
+
+    private final DurationStatistics durationStatistics;
+    private final TopicCounter topicCounter;
+
+    // Epoch milliseconds of the end of the previous accept() call (or of construction).
+    private long lastTimeCalled = Instant.now().toEpochMilli();
+
+    // Writer for the AVRO file currently being filled; null when no file is open.
+    private SyslogAvroWriter syslogAvroWriter;
+    private final long maximumFileSize;
+    private final WritableQueue writableQueue;
+    private final ByteBuffer sourceConcatenationBuffer;
+    private final SDVector teragrepStreamName;
+    private final SDVector teragrepDirectory;
+    private final SDVector eventNodeSourceSource;
+    private final SDVector eventNodeRelaySource;
+    private final SDVector eventNodeSourceSourceModule;
+    private final SDVector eventNodeRelaySourceModule;
+    private final SDVector eventNodeSourceHostname;
+    private final SDVector eventNodeRelayHostname;
+    private final SDVector originHostname;
+    // Local file backing syslogAvroWriter.
+    private File syslogFile;
+    private final Config config;
+    private final boolean skipNonRFC5424Records;
+    private final boolean skipEmptyRFC5424Records;
+
+    /**
+     * @param config             source of the maximum file size, queue directory and skip flags
+     * @param table              name used as the initial queue name prefix and in log messages
+     * @param durationStatistics receives the record and byte totals of every batch
+     * @param topicCounter       receives the latency and throughput figures of every batch
+     */
+    public DatabaseOutput(
+            Config config,
+            String table,
+            DurationStatistics durationStatistics,
+            TopicCounter topicCounter
+    ) {
+        this.config = config;
+        this.table = table;
+        this.durationStatistics = durationStatistics;
+        this.topicCounter = topicCounter;
+        this.maximumFileSize = config.getMaximumFileSize();
+
+        // queueDirectory and queueNamePrefix are only used for temporarily storing the AVRO-serialized files before committing them to HDFS when the file size reaches the threshold (or all records are processed).
+        this.writableQueue = new WritableQueue(config.getQueueDirectory(), table);
+
+        this.sourceConcatenationBuffer = ByteBuffer.allocateDirect(256 * 1024);
+        teragrepStreamName = new SDVector("teragrep@48577", "streamname");
+        teragrepDirectory = new SDVector("teragrep@48577", "directory");
+        this.eventNodeSourceSource = new SDVector("event_node_source@48577", "source");
+        this.eventNodeRelaySource = new SDVector("event_node_relay@48577", "source");
+        this.eventNodeSourceSourceModule = new SDVector("event_node_source@48577", "source_module");
+        this.eventNodeRelaySourceModule = new SDVector("event_node_relay@48577", "source_module");
+        this.eventNodeSourceHostname = new SDVector("event_node_source@48577", "hostname");
+        this.eventNodeRelayHostname = new SDVector("event_node_relay@48577", "hostname");
+        this.originHostname = new SDVector("origin@48577", "hostname");
+        this.skipNonRFC5424Records = config.getSkipNonRFC5424Records();
+        this.skipEmptyRFC5424Records = config.getSkipEmptyRFC5424Records();
+    }
+
+    /**
+     * Closes the current AVRO writer and commits its file to HDFS. The HDFSWrite handle is always
+     * closed, whether or not the commit succeeds.
+     *
+     * @param offsetJo offset JSON (topic, partition, offset) passed to HDFSWrite to name the file
+     */
+    private void commitCurrentFile(JsonObject offsetJo) throws IOException {
+        syslogAvroWriter.close();
+        try (HDFSWrite writer = new HDFSWrite(config, offsetJo)) {
+            writer.commit(syslogFile); // commits the final AVRO-file to HDFS.
+        }
+    }
+
+    /**
+     * Takes the next writable file from the queue, prefixed with topic + partition of the given
+     * offset JSON, and opens a fresh AVRO writer on it.
+     */
+    private void openNextAvroFile(JsonObject offsetJo) throws IOException {
+        writableQueue
+                .setQueueNamePrefix(offsetJo.get("topic").getAsString() + offsetJo.get("partition").getAsString());
+        syslogFile = writableQueue.getNextWritableFile();
+        // The HDFS filename is only finalized when the AVRO-serialized file is finalized, because every Kafka-record added to the file is going to change the offset that is going to be used for the filename.
+        syslogAvroWriter = new SyslogAvroWriter(syslogFile);
+    }
+
+    /**
+     * Commits the current AVRO file to HDFS when the given size exceeds the configured maximum.
+     *
+     * @param fileSize             size to compare against the maximum file size
+     * @param recordOffsetObjectJo offset JSON used for naming the committed file
+     * @return true when the file was committed, false when nothing was done
+     * @throws UncheckedIOException when closing or committing the file fails
+     */
+    private boolean writeToHdfs(long fileSize, JsonObject recordOffsetObjectJo) {
+        if (fileSize <= maximumFileSize) {
+            return false;
+        }
+        try {
+            // file too large for adding the new record to it, write the still adequately sized AVRO-file to HDFS.
+            commitCurrentFile(recordOffsetObjectJo);
+        }
+        catch (IOException ioException) {
+            throw new UncheckedIOException(ioException);
+        }
+        return true;
+    }
+
+    /**
+     * Converts a timestamp to microseconds since the epoch.
+     *
+     * @throws ArithmeticException when the value does not fit in a long
+     */
+    private long rfc3339ToEpoch(ZonedDateTime zonedDateTime) {
+        final Instant instant = zonedDateTime.toInstant();
+
+        final long MICROS_PER_SECOND = 1000L * 1000L;
+        final long NANOS_PER_MICROS = 1000L;
+        final long sec = Math.multiplyExact(instant.getEpochSecond(), MICROS_PER_SECOND);
+
+        return Math.addExact(sec, instant.getNano() / NANOS_PER_MICROS);
+    }
+
+    /**
+     * Input parameter is a list of RecordOffsetObjects. Each object contains a record and its metadata (topic, partition and offset).
+     * Each partition will get their set of exclusive AVRO-files in HDFS.
+     * The target where the record is stored in HDFS is based on the topic, partition and last offset. ie. topic_name/0.123456 where last written record's offset is 123456.
+     * AVRO-file with a path/name that starts with topic_name/0.X should only contain records from the 0th partition of topic named topic_name, topic_name/1.X should only contain records from 1st partition, etc.
+     * AVRO-files are created dynamically, thus it is not known which record (and its offset) is written to the file last before committing it to HDFS. The final name for the HDFS file is decided only when the file is committed to HDFS.
+     *
+     * @throws UncheckedIOException     when writing or committing a file fails
+     * @throws IllegalArgumentException when the first file of the batch cannot be opened
+     * @throws NullPointerException     when a record has null content and empty records are not skipped
+     * @throws RuntimeException         when a record fails to parse and non-RFC5424 records are not skipped
+     */
+    @Override
+    public void accept(List<RecordOffset> recordOffsetObjectList) {
+        long thisTime = Instant.now().toEpochMilli();
+        long ftook = thisTime - lastTimeCalled;
+        topicCounter.setKafkaLatency(ftook);
+        if (LOGGER.isDebugEnabled()) {
+            // The divisor is clamped: two calls within the same millisecond give ftook == 0, which
+            // used to raise ArithmeticException whenever debug logging was enabled.
+            LOGGER
+                    .debug(
+                            "Fuura searching your batch for <[{}]> with records <{}> and took <{}> milliseconds. <{}> EPS. ",
+                            table, recordOffsetObjectList.size(), (ftook),
+                            (recordOffsetObjectList.size() * 1000L / Math.max(ftook, 1L))
+                    );
+        }
+        long batchBytes = 0L;
+
+        /* The recordOffsetObjectList loop will go through all the objects in the list.
+         While it goes through the list, the contents of the objects are serialized into an AVRO-file.
+         When the file size is about to go above 64M, commit the file into HDFS using the latest topic/partition/offset values as the filename and start fresh with a new empty AVRO-file.
+         Serialize the object that was going to make the file go above 64M into the now empty AVRO-file and continue the loop.
+         TODO: If the prod-environment recordOffsetObjectList ordering is different from what it is in the test environment, add a function that reorders the list based on partition and offset (or better yet, make several AVRO-files that are being used at the same time rather than doing it one AVRO-file at a time as the offset ordering within partitions should always be correct in all scenarios).*/
+        Offset lastObject = new NullOffset(); // Set to null object before initializing as RecordOffsetObject.
+        JsonObject lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
+        long start = Instant.now().toEpochMilli(); // Starts measuring performance here. Measures how long it takes to process the whole recordOffsetObjectList.
+        // This loop goes through all the records of the mock data in a single session.
+        for (RecordOffset recordOffsetObject : recordOffsetObjectList) {
+            JsonObject recordOffsetObjectJo = JsonParser
+                    .parseString(recordOffsetObject.offsetToJSON())
+                    .getAsJsonObject();
+            // Initializing syslogAvroWriter and lastObject.
+            if (syslogAvroWriter == null && lastObject.isNull()) {
+                try {
+                    openNextAvroFile(recordOffsetObjectJo);
+                    lastObject = recordOffsetObject;
+                    lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
+                }
+                catch (IOException ioException) {
+                    throw new IllegalArgumentException(ioException);
+                }
+            }
+            else {
+                try {
+                    final boolean samePartition = lastObjectJo
+                            .get("topic")
+                            .getAsString()
+                            .equals(recordOffsetObjectJo.get("topic").getAsString())
+                            && lastObjectJo
+                                    .get("partition")
+                                    .getAsString()
+                                    .equals(recordOffsetObjectJo.get("partition").getAsString());
+                    if (samePartition) {
+                        // Records left to consume in the current partition.
+                        boolean fileCommitted = writeToHdfs(syslogAvroWriter.getFileSize(), lastObjectJo);
+                        if (fileCommitted) {
+                            // Start a new empty file that collects records until it again hits the size limit defined in config.
+                            openNextAvroFile(recordOffsetObjectJo);
+                            if (LOGGER.isDebugEnabled()) {
+                                LOGGER
+                                        .debug(
+                                                "Target file size reached, file <{}> stored to <{}> in HDFS",
+                                                syslogFile.getName(),
+                                                lastObjectJo.get("topic").getAsString() + "/"
+                                                        + lastObjectJo.get("partition").getAsString() + "."
+                                                        + lastObjectJo.get("offset").getAsString()
+                                        );
+                            }
+                        }
+                        else {
+                            if (LOGGER.isDebugEnabled()) {
+                                LOGGER
+                                        .debug(
+                                                "Target file size not yet reached, continuing writing records to <{}>.",
+                                                syslogFile.getName()
+                                        );
+                            }
+                        }
+                    }
+                    else {
+                        // Previous partition was fully consumed. Commit file to HDFS and create a new AVRO-file.
+                        // Goes through the same helper as the other commit sites so the HDFSWrite
+                        // handle is closed here as well.
+                        commitCurrentFile(lastObjectJo);
+                        openNextAvroFile(recordOffsetObjectJo);
+                    }
+                }
+                catch (IOException ioException) {
+                    throw new UncheckedIOException(ioException);
+                }
+            }
+
+            byte[] byteArray = recordOffsetObject.getRecord(); // loads the byte[] contained in recordOffsetObject.getRecord() to byteArray.
+            if (byteArray == null) {
+                if (skipEmptyRFC5424Records) {
+                    if (LOGGER.isDebugEnabled()) {
+                        LOGGER
+                                .debug(
+                                        "Skipping processing an empty non RFC5424 record. Record metadata: {}",
+                                        recordOffsetObject.offsetToJSON()
+                                );
+                    }
+                    continue;
+                }
+                else {
+                    if (LOGGER.isDebugEnabled()) {
+                        LOGGER.debug("Null record metadata: {}", recordOffsetObject.offsetToJSON());
+                    }
+                    syslogFile.delete(); // Clean up
+                    throw new NullPointerException("Record with null content detected during processing.");
+                }
+
+            }
+            InputStream inputStream = new ByteArrayInputStream(byteArray);
+            rfc5424Frame.load(inputStream);
+            try {
+                if (rfc5424Frame.next()) {
+                    /*rfc5424Frame has loaded the record data, it's ready for deserialization.
+                     Implement AVRO serialization for the Kafka records here, preparing the data for writing to HDFS.
+                     Write all the data into a file using AVRO.
+                     The size of each AVRO-serialized file should be as close to 64M as possible.*/
+
+                    batchBytes = batchBytes + byteArray.length;
+
+                    // input
+                    final byte[] source = eventToSource();
+
+                    // origin
+                    final byte[] origin = eventToOrigin();
+
+                    // Format: Use AVRO format with syslog columns as indexed ones
+                    final long epochMicros = rfc3339ToEpoch(
+                            new RFC5424Timestamp(rfc5424Frame.timestamp).toZonedDateTime()
+                    );
+                    SyslogRecord syslogRecord = SyslogRecord
+                            .newBuilder()
+                            .setTimestamp(epochMicros)
+                            .setPayload(rfc5424Frame.msg.toString())
+                            .setDirectory(rfc5424Frame.structuredData.getValue(teragrepDirectory).toString())
+                            .setStream(rfc5424Frame.structuredData.getValue(teragrepStreamName).toString()) // Or is sourcetype/stream supposed to be rfc5424Frame.appName.toString() instead?
+                            .setHost(rfc5424Frame.hostname.toString())
+                            .setInput(new String(source, StandardCharsets.UTF_8))
+                            .setPartition(recordOffsetObjectJo.get("partition").getAsString())
+                            .setOffset(recordOffsetObjectJo.get("offset").getAsLong())
+                            .setOrigin(new String(origin, StandardCharsets.UTF_8))
+                            .build();
+
+                    // Calculate the size of syslogRecord that is going to be written to syslogAvroWriter-file.
+                    long capacity = syslogRecord.toByteBuffer().capacity();
+                    // Check if there is still room in syslogAvroWriter for another syslogRecord. Commit syslogAvroWriter to HDFS if no room left, emptying it out in the process.
+                    boolean fileCommitted = writeToHdfs(syslogAvroWriter.getFileSize() + capacity, lastObjectJo);
+                    if (fileCommitted) {
+                        // Start a new empty file that collects records until it again hits the size limit defined in config.
+                        openNextAvroFile(recordOffsetObjectJo);
+                        if (LOGGER.isDebugEnabled()) {
+                            LOGGER
+                                    .debug(
+                                            "Target file size reached, file <{}> stored to <{}/{}.{}> in HDFS",
+                                            syslogFile.getName(), lastObjectJo.get("topic").getAsString(),
+                                            lastObjectJo.get("partition").getAsString(),
+                                            lastObjectJo.get("offset").getAsString()
+                                    );
+                        }
+                    }
+                    else {
+                        if (LOGGER.isDebugEnabled()) {
+                            LOGGER
+                                    .debug(
+                                            "Target file size not yet reached, continuing writing records to <{}>.",
+                                            syslogFile.getName()
+                                    );
+                        }
+                    }
+                    // Add syslogRecord to syslogAvroWriter which has room for new syslogRecord.
+                    syslogAvroWriter.write(syslogRecord);
+                    lastObject = recordOffsetObject;
+                    lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
+                }
+            }
+            catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+            catch (ParseException e) {
+                if (skipNonRFC5424Records) {
+                    if (LOGGER.isDebugEnabled()) {
+                        LOGGER
+                                .debug(
+                                        "Skipping processing a non RFC5424 record, record metadata: {}. Exception information: ",
+                                        recordOffsetObject.offsetToJSON(), e
+                                );
+                    }
+                    continue;
+                }
+                else {
+                    if (LOGGER.isDebugEnabled()) {
+                        LOGGER
+                                .debug(
+                                        "Record metadata that is causing ParseException: {}.",
+                                        recordOffsetObject.offsetToJSON()
+                                );
+                    }
+                    syslogFile.delete(); // Clean up
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+
+        // Handle the "leftover" syslogRecords from the loop.
+        try {
+            if (syslogAvroWriter != null && !lastObject.isNull()) {
+                commitCurrentFile(lastObjectJo);
+                // Drop the closed writer so the next accept() call opens a fresh file through the
+                // initialization branch instead of reusing a closed writer together with the
+                // NullOffset JSON.
+                syslogAvroWriter = null;
+            }
+        }
+        catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+
+        // Measures performance of code that is between start and end.
+        long end = Instant.now().toEpochMilli();
+
+        long took = (end - start);
+        topicCounter.setDatabaseLatency(took);
+
+        if (took == 0) {
+            took = 1;
+        }
+        long rps = recordOffsetObjectList.size() * 1000L / took;
+        topicCounter.setRecordsPerSecond(rps);
+
+        long bps = batchBytes * 1000 / took;
+        topicCounter.setBytesPerSecond(bps);
+
+        durationStatistics.addAndGetRecords(recordOffsetObjectList.size());
+        durationStatistics.addAndGetBytes(batchBytes);
+
+        topicCounter.addToTotalBytes(batchBytes);
+        topicCounter.addToTotalRecords(recordOffsetObjectList.size());
+
+        if (LOGGER.isDebugEnabled()) {
+            LOGGER
+                    .debug(
+                            "Sent batch for <[{}]> with records <{}> and size <{}> KB took <{}> milliseconds. <{}> RPS. <{}> KB/s ",
+                            table, recordOffsetObjectList.size(), batchBytes / 1024, (took), rps, bps / 1024
+                    );
+        }
+        lastTimeCalled = Instant.now().toEpochMilli();
+    }
+
+    /** Returns the bytes of the fragment, or an empty array when the fragment is a stub. */
+    private static byte[] bytesOrEmpty(Fragment fragment) {
+        if (fragment.isStub) {
+            return new byte[] {};
+        }
+        return fragment.toBytes();
+    }
+
+    /**
+     * Reads the primary structured-data value of the current frame and falls back to the
+     * alternative one when the primary is a stub.
+     */
+    private Fragment valueOrFallback(SDVector primary, SDVector fallback) {
+        Fragment fragment = rfc5424Frame.structuredData.getValue(primary);
+        if (fragment.isStub) {
+            fragment = rfc5424Frame.structuredData.getValue(fallback);
+        }
+        return fragment;
+    }
+
+    /** Returns the origin@48577 hostname of the current frame, or an empty array when absent. */
+    private byte[] eventToOrigin() {
+        return bytesOrEmpty(rfc5424Frame.structuredData.getValue(originHostname));
+    }
+
+    private byte[] eventToSource() {
+        /*input is produced from SD element event_node_source@48577 by
+         concatenating "source_module:hostname:source". in case
+         if event_node_source@48577 is not available use event_node_relay@48577.
+         If neither are present, use null value.*/
+
+        sourceConcatenationBuffer.clear();
+
+        byte[] source_module = bytesOrEmpty(valueOrFallback(eventNodeSourceSourceModule, eventNodeRelaySourceModule));
+        byte[] source_hostname = bytesOrEmpty(valueOrFallback(eventNodeSourceHostname, eventNodeRelayHostname));
+        // The relay fallback for "source" is decided by the source fragment itself; the earlier
+        // code tested the hostname fragment here.
+        byte[] source_source = bytesOrEmpty(valueOrFallback(eventNodeSourceSource, eventNodeRelaySource));
+
+        sourceConcatenationBuffer.put(source_module);
+        sourceConcatenationBuffer.put((byte) ':');
+        sourceConcatenationBuffer.put(source_hostname);
+        sourceConcatenationBuffer.put((byte) ':');
+        sourceConcatenationBuffer.put(source_source);
+
+        sourceConcatenationBuffer.flip();
+        byte[] input = new byte[sourceConcatenationBuffer.remaining()];
+        sourceConcatenationBuffer.get(input);
+
+        return input;
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java
new file mode 100644
index 00000000..7a3fddb6
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java
@@ -0,0 +1,98 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.Config;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Deletes entries of one topic directory in HDFS whose modification time is older than the
+ * configured prune offset.
+ */
+public class HDFSPrune {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(HDFSPrune.class);
+    private final FileSystem fs;
+    private final Path newDirectoryPath;
+    // Entries modified before this epoch-millisecond instant are pruned.
+    private final long cutOffEpoch;
+
+    /**
+     * Resolves the topic directory under the configured HDFS path, creating it when missing, and
+     * fixes the cut-off time as "now minus the configured prune offset".
+     *
+     * @param config    source of the HDFS base path and the prune offset
+     * @param topicName name of the topic directory below the HDFS base path
+     * @param fs        file system the directory lives in
+     * @throws IOException when the directory cannot be checked or created
+     */
+    public HDFSPrune(Config config, String topicName, FileSystem fs) throws IOException {
+        this.fs = fs;
+        String path = config.getHdfsPath().concat("/").concat(topicName);
+        //==== Create directory if not exists
+        newDirectoryPath = new Path(path);
+        if (!fs.exists(newDirectoryPath)) {
+            // Create new Directory
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.info("Path <{}> created.", path);
+        }
+        long pruneOffset = config.getPruneOffset();
+        cutOffEpoch = System.currentTimeMillis() - pruneOffset; // pruneOffset is parametrized in Config.java. Default value is 2 days in milliseconds.
+    }
+
+    /**
+     * Deletes, recursively, every entry of the topic directory that was last modified before the
+     * cut-off time.
+     *
+     * @return number of entries the file system reported as deleted
+     * @throws IOException when listing or deleting fails
+     */
+    public int prune() throws IOException {
+        int deleted = 0;
+        final Path directory = new Path(newDirectoryPath + "/");
+        // Fetch the filestatuses of HDFS files.
+        FileStatus[] fileStatuses = fs.listStatus(directory);
+        if (fileStatuses.length > 0) {
+            for (FileStatus fileStatus : fileStatuses) {
+                // Delete old files
+                if (fileStatus.getModificationTime() < cutOffEpoch) {
+                    // Only report and count the entry when the file system confirms the deletion.
+                    if (fs.delete(fileStatus.getPath(), true)) {
+                        LOGGER.info("Deleted file <{}>", fileStatus.getPath());
+                        deleted++;
+                    }
+                    else {
+                        LOGGER.warn("Failed to delete file <{}>", fileStatus.getPath());
+                    }
+                }
+            }
+        }
+        else {
+            LOGGER.info("No files found in directory <{}>", directory);
+        }
+        return deleted;
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java
new file mode 100644
index 00000000..bf1f92f6
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java
@@ -0,0 +1,117 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.Config;
+import org.apache.hadoop.fs.*;
+import org.apache.kafka.common.TopicPartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public final class HDFSRead implements AutoCloseable {
+    /* Maps out the latest offset for all the topic partitions available in HDFS.
+     The offset map can then be used for kafka consumer seek() method, which will add the idempotent functionality to the consumer.
+     Also, because this class should be called outside the loops that generate the consumer groups it should be lightweight to run.*/
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(HDFSRead.class);
+    private final FileSystem fs;
+    private final String path;
+
+    /**
+     * @param config source of the HDFS base path
+     * @param fs     file system to read from; it is not closed by this class
+     */
+    public HDFSRead(Config config, FileSystem fs) throws IOException {
+        this.fs = fs;
+        path = config.getHdfsPath();
+    }
+
+    /**
+     * Scans every topic directory below the HDFS base path (creating the base path when missing)
+     * and derives, from file names of the form {@code <partition>.<offset>}, the highest stored
+     * offset plus one for each topic partition.
+     *
+     * Entries that are not directories and files whose names do not follow the naming scheme are
+     * skipped with a warning instead of aborting the scan.
+     *
+     * @return map from topic partition to the highest stored offset plus one; empty when nothing is stored
+     * @throws IOException when the file system cannot be queried
+     */
+    public Map<TopicPartition, Long> hdfsStartOffsets() throws IOException {
+        Map<TopicPartition, Long> offsets = new HashMap<>();
+
+        Path rootPath = new Path(path);
+        if (!fs.exists(rootPath)) {
+            // Create new Directory
+            fs.mkdirs(rootPath);
+            LOGGER.info("Path <{}> created.", path);
+        }
+
+        // Get the directory statuses. Each directory represents a Kafka topic.
+        FileStatus[] directoryStatuses = fs.listStatus(rootPath);
+        if (directoryStatuses.length == 0) {
+            LOGGER.info("No matching directories found");
+            return offsets;
+        }
+
+        LOGGER.debug("Found <{}> matching directories", directoryStatuses.length);
+        for (FileStatus directoryStatus : directoryStatuses) {
+            if (!directoryStatus.isDirectory()) {
+                // A plain file directly under the base path does not belong to any topic.
+                LOGGER.warn("Skipping non-directory entry <{}>", directoryStatus.getPath());
+                continue;
+            }
+            // Get the file statuses that are inside the directories.
+            for (FileStatus fileStatus : fs.listStatus(directoryStatus.getPath())) {
+                String topic = fileStatus.getPath().getParent().getName();
+                // The file name can be split to partition parameter and offset parameter. First value is partition and second is offset.
+                String[] split = fileStatus.getPath().getName().split("\\.");
+                if (split.length < 2) {
+                    LOGGER.warn("Skipping file <{}>, name is not of form partition.offset", fileStatus.getPath());
+                    continue;
+                }
+                try {
+                    int partition = Integer.parseInt(split[0]);
+                    long nextOffset = Long.parseLong(split[1]) + 1;
+                    // Keep the largest "last offset + 1" seen for the partition.
+                    offsets.merge(new TopicPartition(topic, partition), nextOffset, Long::max);
+                }
+                catch (NumberFormatException e) {
+                    LOGGER.warn("Skipping file <{}>, name is not of form partition.offset", fileStatus.getPath());
+                }
+            }
+        }
+        return offsets;
+    }
+
+    /** Does nothing; exists so the class can be used in try-with-resources. */
+    @Override
+    public void close() {
+        // NoOp, as closing the FileSystem object here would also close all the other FileSystem objects.
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java
new file mode 100644
index 00000000..c949ee81
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java
@@ -0,0 +1,192 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39.consumers.kafka; + +import com.google.gson.JsonObject; +import com.teragrep.cfe_39.Config; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.util.Properties; + +public class HDFSWrite implements AutoCloseable { + + private static final Logger LOGGER = LoggerFactory.getLogger(HDFSWrite.class); + private final String fileName; + private final String path; + private final FileSystem fs; + private final boolean useMockKafkaConsumer; // Defines if mock HDFS database is used for testing + private final HdfsConfiguration conf; + private final String hdfsuri; + + public HDFSWrite(Config config, JsonObject lastObjectJo) throws IOException { + + Properties readerKafkaProperties = config.getKafkaConsumerProperties(); + this.useMockKafkaConsumer = Boolean + .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false")); + + if (useMockKafkaConsumer) { + // Code for initializing the class for mock hdfs database usage without kerberos. + hdfsuri = config.getHdfsuri(); + + /* The filepath should be something like hdfs:///opt/teragrep/cfe_39/srv/topic_name/0.12345 where 12345 is offset and 0 the partition. + In other words the directory named topic_name holds files that are named and arranged based on partition and the partition's offset. Every partition has its own set of unique offset values. + These values should be fetched from config and other input parameters (topic+partition+offset).*/ + path = config.getHdfsPath() + "/" + lastObjectJo.get("topic").getAsString(); + fileName = lastObjectJo.get("partition").getAsString() + "." + lastObjectJo.get("offset").getAsString(); // filename should be constructed from partition and offset. 
+ + // ====== Init HDFS File System Object + conf = new HdfsConfiguration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", LocalFileSystem.class.getName()); + // Set HADOOP user here. + System.setProperty("HADOOP_USER_NAME", "hdfs"); + System.setProperty("hadoop.home.dir", "/"); + // filesystem for HDFS access is set here + try { + fs = FileSystem.get(URI.create(hdfsuri), conf); + } + catch (IOException e) { + throw new RuntimeException(e); + } + + } + else { + // Code for initializing the class for kerberized HDFS database usage. + hdfsuri = config.getHdfsuri(); + + path = config.getHdfsPath() + "/" + lastObjectJo.get("topic").getAsString(); + fileName = lastObjectJo.get("partition").getAsString() + "." + lastObjectJo.get("offset").getAsString(); + + // set kerberos host and realm + System.setProperty("java.security.krb5.realm", config.getKerberosRealm()); + System.setProperty("java.security.krb5.kdc", config.getKerberosHost()); + + conf = new HdfsConfiguration(); + + // enable kerberus + conf.set("hadoop.security.authentication", config.getHadoopAuthentication()); + conf.set("hadoop.security.authorization", config.getHadoopAuthorization()); + conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", config.getKerberosLoginAutorenewal()); + + conf.set("fs.defaultFS", hdfsuri); // Set FileSystem URI + conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); // Maven stuff? + conf.set("fs.file.impl", LocalFileSystem.class.getName()); // Maven stuff? 
+ + // hack for running locally with fake DNS records, set this to true if overriding the host name in /etc/hosts + conf.set("dfs.client.use.datanode.hostname", config.getKerberosTestMode()); + + // server principal, the kerberos principle that the namenode is using + conf.set("dfs.namenode.kerberos.principal.pattern", config.getKerberosPrincipal()); + + // set sasl + conf.set("dfs.data.transfer.protection", config.getDfsDataTransferProtection()); + conf.set("dfs.encrypt.data.transfer.cipher.suites", config.getDfsEncryptDataTransferCipherSuites()); + + // filesystem for HDFS access is set here + fs = FileSystem.get(conf); + } + } + + // Method for committing the AVRO-file to HDFS + public void commit(File syslogFile) { + // The code for writing the file to HDFS should be same for both test (non-kerberized access) and prod (kerberized access). + try { + //==== Create directory if not exists + Path workingDir = fs.getWorkingDirectory(); + // Sets the directory where the data should be stored, if the directory doesn't exist then it's created. + Path newDirectoryPath = new Path(path); + if (!fs.exists(newDirectoryPath)) { + // Create new Directory + fs.mkdirs(newDirectoryPath); + LOGGER.info("Path <{}> created.", path); + } + + //==== Write file + LOGGER.debug("Begin Write file into hdfs"); + //Create a path + Path hdfswritepath = new Path(newDirectoryPath.toString() + "/" + fileName); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset. 
+ if (fs.exists(hdfswritepath)) { + LOGGER + .debug( + "Deleting the seemingly duplicate source file {} because target file {} already exists in HDFS", + syslogFile.getPath(), hdfswritepath + ); + syslogFile.delete(); + throw new RuntimeException("File " + fileName + " already exists"); + } + else { + LOGGER.debug("Target file <{}> doesn't exist, proceeding normally.", hdfswritepath); + } + + Path path = new Path(syslogFile.getPath()); + fs.copyFromLocalFile(path, hdfswritepath); + LOGGER.debug("End Write file into hdfs"); + boolean delete = syslogFile.delete(); // deletes the avro-file from the local disk now that it has been committed to HDFS. + LOGGER.info("\nFile committed to HDFS, file writepath should be: <{}>\n", hdfswritepath); + + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + // try-with-resources handles closing the filesystem automatically. + public void close() { + /* NoOp + When used here fs.close() doesn't just affect the current class, it affects all the FileSystem objects that were created using FileSystem.get(URI.create(hdfsuri), conf); in different threads.*/ + } + +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java new file mode 100644 index 00000000..305035bd --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java @@ -0,0 +1,272 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39.consumers.kafka; + +import com.teragrep.cfe_39.Config; +import com.teragrep.cfe_39.metrics.*; +import com.teragrep.cfe_39.metrics.topic.TopicCounter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.PartitionInfo; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.sql.SQLException; +import java.time.Duration; +import java.util.*; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +// Ingests data for HDFS database, periodically scans kafka for new topics based on config.getQueueTopicPattern() and creates kafka topic consumer groups for the new topics that will store the records to HDFS. 
+public class HdfsDataIngestion { + + private static final Logger LOGGER = LoggerFactory.getLogger(HdfsDataIngestion.class); + private final Config config; + private final org.apache.kafka.clients.consumer.Consumer kafkaConsumer; + private final List threads = new ArrayList<>(); + private final Set activeTopics = new HashSet<>(); + private boolean keepRunning; + private boolean useMockKafkaConsumer; + private final int numOfConsumers; + private Map hdfsStartOffsets; + private final FileSystem fs; + + public HdfsDataIngestion(Config config) throws IOException { + keepRunning = true; + this.config = config; + Properties readerKafkaProperties = config.getKafkaConsumerProperties(); + this.numOfConsumers = config.getNumOfConsumers(); + this.useMockKafkaConsumer = Boolean + .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false")); + if (useMockKafkaConsumer) { + this.kafkaConsumer = MockKafkaConsumerFactory.getConsumer(0); // A consumer used only for scanning the available topics to be allocated to consumers running in different threads (thus 0 as input parameter). + // Initializing the FileSystem with minicluster. + String hdfsuri = config.getHdfsuri(); + // ====== Init HDFS File System Object + HdfsConfiguration conf = new HdfsConfiguration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // Set HADOOP user + System.setProperty("HADOOP_USER_NAME", "hdfs"); + System.setProperty("hadoop.home.dir", "/"); + //Get the filesystem - HDFS + fs = FileSystem.get(URI.create(hdfsuri), conf); + } + else { + this.kafkaConsumer = new KafkaConsumer<>( + config.getKafkaConsumerProperties(), + new ByteArrayDeserializer(), + new ByteArrayDeserializer() + ); + // Initializing the FileSystem with kerberos. 
+ String hdfsuri = config.getHdfsuri(); // Get from config. + // set kerberos host and realm + System.setProperty("java.security.krb5.realm", config.getKerberosRealm()); + System.setProperty("java.security.krb5.kdc", config.getKerberosHost()); + HdfsConfiguration conf = new HdfsConfiguration(); + // enable kerberus + conf.set("hadoop.security.authentication", config.getHadoopAuthentication()); + conf.set("hadoop.security.authorization", config.getHadoopAuthorization()); + conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", config.getKerberosLoginAutorenewal()); + conf.set("fs.defaultFS", hdfsuri); // Set FileSystem URI + conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); // Maven stuff? + conf.set("fs.file.impl", LocalFileSystem.class.getName()); // Maven stuff? + /* hack for running locally with fake DNS records + set this to true if overriding the host name in /etc/hosts*/ + conf.set("dfs.client.use.datanode.hostname", config.getKerberosTestMode()); + /* server principal + the kerberos principle that the namenode is using*/ + conf.set("dfs.namenode.kerberos.principal.pattern", config.getKerberosPrincipal()); + // set sasl + conf.set("dfs.data.transfer.protection", config.getDfsDataTransferProtection()); + conf.set("dfs.encrypt.data.transfer.cipher.suites", config.getDfsEncryptDataTransferCipherSuites()); + // set usergroup stuff + UserGroupInformation.setConfiguration(conf); + UserGroupInformation.loginUserFromKeytab(config.getKerberosKeytabUser(), config.getKerberosKeytabPath()); + // filesystem for HDFS access is set here + fs = FileSystem.get(conf); + } + hdfsStartOffsets = new HashMap<>(); + } + + public void run() throws InterruptedException, IOException { + + // Initialize and register duration statistics + DurationStatistics durationStatistics = new DurationStatistics(); + durationStatistics.register(); + + // register per topic counting + List topicCounters = new CopyOnWriteArrayList<>(); + + // Generates offsets of the already 
committed records for Kafka and passes them to the kafka consumers. + try (HDFSRead hr = new HDFSRead(config, fs)) { + hdfsStartOffsets = hr.hdfsStartOffsets(); + LOGGER.debug("topicPartitionStartMap generated succesfully: <{}>", hdfsStartOffsets); + } + catch (IOException e) { + throw new RuntimeException(e); + } + + while (keepRunning) { + if ("kerberos".equals(config.getHadoopAuthentication())) { + UserGroupInformation.getLoginUser().checkTGTAndReloginFromKeytab(); + } + LOGGER.debug("Scanning for threads"); + topicScan(durationStatistics, topicCounters); + + // log stuff + durationStatistics.log(); + long topicScanDelay = 30000L; + Thread.sleep(topicScanDelay); + for (String topic_name : activeTopics) { + LOGGER.info("topic that is being pruned: <{}>", topic_name); + if (topic_name != null) { + try { + HDFSPrune hdfsPrune = new HDFSPrune(config, topic_name, fs); + hdfsPrune.prune(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + } + // For testing purposes only. Stops the run when all the records are consumed from the mockConsumer during test. + if (durationStatistics.getTotalRecords() > 0 & useMockKafkaConsumer) { + LOGGER.debug("Processed all the test records. Closing."); + keepRunning = false; + } + + } + } + + // Creates kafka topic consumer based on input parameters. + private void createReader( + String topic, + List listPartitionInfo, + List topicCounters, + DurationStatistics durationStatistics + ) throws SQLException { + + // Create a new topicCounter object for the topic that has not been added to topicCounters-list yet. + TopicCounter topicCounter = new TopicCounter(topic); + // Add the new topicCounter object to the list. + topicCounters.add(topicCounter); + + /* Every consumer is run in a separate thread. 
+ Consumer group is also handled here, and each consumer of the group runs on separate thread.*/ + int numOfThreads = Math.min(numOfConsumers, listPartitionInfo.size()); // Makes sure that there aren't more consumers than available partitions in the consumer group. + for (int threadId = 1; numOfThreads >= threadId; threadId++) { + Consumer> output = new DatabaseOutput( + config, // Configuration settings + topic, // String, the name of the topic + durationStatistics, // RuntimeStatistics object from metrics + topicCounter // TopicCounter object from metrics + ); + ReadCoordinator readCoordinator = new ReadCoordinator( + topic, + config.getKafkaConsumerProperties(), + output, + hdfsStartOffsets + ); + Thread readThread = new Thread(null, readCoordinator, topic + threadId); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic. + threads.add(readThread); + readThread.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator. + } + + } + + private void topicScan(DurationStatistics durationStatistics, List topicCounters) { + Map> listTopics = kafkaConsumer.listTopics(Duration.ofSeconds(60)); + Pattern topicsRegex = Pattern.compile(config.getQueueTopicPattern()); + // Find the topics available in Kafka based on given QueueTopicPattern, both active and in-active. 
+ Set foundTopics = new HashSet<>(); + Map> foundPartitions = new HashMap<>(); + for (Map.Entry> entry : listTopics.entrySet()) { + Matcher matcher = topicsRegex.matcher(entry.getKey()); + if (matcher.matches()) { + foundTopics.add(entry.getKey()); + foundPartitions.put(entry.getKey(), entry.getValue()); + } + } + if (foundTopics.isEmpty()) { + throw new IllegalStateException("Pattern <[" + config.getQueueTopicPattern() + "]> found no topics."); + } + // subtract currently active topics from found topics + foundTopics.removeAll(activeTopics); + // Subtract currently active partitions from found partitions + for (String topic_name : activeTopics) { + foundPartitions.remove(topic_name); // removes the partitions from the list based on the topic name. + } + + // Activate all the found in-active topics, in other words create consumer groups for all of them using the createReader()-function. + foundPartitions.forEach((k, v) -> { + LOGGER.debug("Activating topic <{}>", k); + try { + createReader(k, v, topicCounters, durationStatistics); + activeTopics.add(k); + durationStatistics.addAndGetThreads(1); + } + catch (SQLException sqlException) { + LOGGER.error("Topic <{}> not activated due to reader creation error: {}", k, sqlException); + } + }); + durationStatistics.report(); + } + +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java new file mode 100644 index 00000000..a60d9d25 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java @@ -0,0 +1,104 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39.consumers.kafka; + +import org.apache.kafka.clients.consumer.*; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Duration; +import java.util.*; + +public class KafkaReader implements AutoCloseable { + + final Logger LOGGER = LoggerFactory.getLogger(KafkaReader.class); + private Iterator> kafkaRecordsIterator = Collections.emptyIterator(); + private final Consumer kafkaConsumer; + private final java.util.function.Consumer> callbackFunction; + + public KafkaReader( + Consumer kafkaConsumer, + java.util.function.Consumer> callbackFunction + ) { + this.kafkaConsumer = kafkaConsumer; + this.callbackFunction = callbackFunction; + } + + public void read() { + long offset; + if (!kafkaRecordsIterator.hasNext()) { + // still need to consume more, infinitely loop because connection problems may cause return of an empty iterator + ConsumerRecords kafkaRecords = kafkaConsumer.poll(Duration.ofSeconds(60)); + if (kafkaRecords.isEmpty()) { + LOGGER.debug("kafkaRecords empty after poll."); + } + kafkaRecordsIterator = kafkaRecords.iterator(); + } + + List recordOffsetObjectList = new ArrayList<>(); + while (kafkaRecordsIterator.hasNext()) { + ConsumerRecord record = kafkaRecordsIterator.next(); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("adding from offset: <{}>", record.offset()); + } + recordOffsetObjectList + .add(new RecordOffset(record.topic(), record.partition(), record.offset(), record.value())); + } + + if (!recordOffsetObjectList.isEmpty()) { + /* This is the DatabaseOutput.accept() function. 
+ Offset and other required data for HDFS storage are added to the input parameters of the accept() function which processes the consumed record.*/ + callbackFunction.accept(recordOffsetObjectList); + kafkaConsumer.commitSync(); + } + } + + @Override + public void close() { + kafkaConsumer.close(Duration.ofSeconds(60)); + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java new file mode 100644 index 00000000..e5da3a81 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java @@ -0,0 +1,315 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. 
+ * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39.consumers.kafka; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.MockConsumer; +import org.apache.kafka.clients.consumer.OffsetResetStrategy; +import org.apache.kafka.common.PartitionInfo; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.charset.StandardCharsets; +import java.util.*; + +/** + *

Mock Kafka Consumer Factory

Mocked Kafka Consumer factory used for testing. + * + * @since 08/06/2022 + * @author Mikko Kortelainen + */ +@VisibleForTesting +public class MockKafkaConsumerFactory { + + final static private Logger LOGGER = LoggerFactory.getLogger(MockKafkaConsumerFactory.class); + + private MockKafkaConsumerFactory() { + } + + private static void generateEvents(MockConsumer consumer, String topicName, int partition) { + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 0L, + "2022-04-25T07:34:50.804Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.804Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"835bf792-91cf-44e3-976b-518330bb8fd3\" source=\"source\" unixtime=\"1650872090805\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!" 
+ .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 1L, + "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!" 
+ .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 2L, + "2022-04-25T07:34:50.822Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"1848d8a1-2f08-4a1e-bec4-ff9e6dd92553\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi." 
+ .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 3L, + "2022-04-25T07:34:50.822Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"5e1a0398-c2a0-468d-a562-c3bb31f0f853\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi." 
+ .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 4L, + "2022-04-25T07:34:50.822Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"6268c3a2-5bda-427f-acce-29416eb445f4\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi." 
+ .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 5L, + "2022-04-25T07:34:52.238Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.238Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"b500dcaf-1101-4000-b6b9-bfb052ddbf86\" source=\"source\" unixtime=\"1650872092238\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 6L, + "2022-04-25T07:34:52.239Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.239Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"05363122-51ac-4c0b-a681-f5868081f56d\" source=\"source\" unixtime=\"1650872092239\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" 
hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 7L, + "2022-04-25T07:34:52.239Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.239Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"7bbcd843-b795-4c14-b4a1-95f5d445cbcd\" source=\"source\" unixtime=\"1650872092239\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 8L, + "2022-04-25T07:34:52.240Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.240Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"2bc0a9f9-237d-4656-b40a-3038aace37f0\" source=\"source\" unixtime=\"1650872092240\"][event_format@48577 
original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 9L, + "2022-04-25T07:34:52.240Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.240Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"ecf61e8d-e3a7-48ef-9b73-3c5a5243d2e6\" source=\"source\" unixtime=\"1650872092240\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 10L, + 
"2022-04-25T07:34:52.241Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.241Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"bf101d5a-e816-4f51-b132-97f8e3431f8e\" source=\"source\" unixtime=\"1650872092241\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 11L, + "2022-04-25T07:34:52.241Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.241Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"ef94d9e9-3c44-4892-b5a6-bf361d13ff97\" source=\"source\" unixtime=\"1650872092241\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 
25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 12L, + "2022-04-25T07:34:52.242Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.242Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"5bce6e3d-767d-44b4-a044-6c4872f8f2b5\" source=\"source\" unixtime=\"1650872092242\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 13L, + "2022-04-25T07:34:52.243Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:52.243Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"3bb55ce4-0ea7-413a-b403-28b174d7ac99\" source=\"source\" unixtime=\"1650872092243\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" 
source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 14L, + "2022-04-25T07:34:52.244Z".getBytes(StandardCharsets.UTF_8), + null + ) + ); + consumer + .addRecord( + new ConsumerRecord<>( + topicName, + partition, + 15L, + "2022-04-25T07:34:52.245Z".getBytes(StandardCharsets.UTF_8), + "12>1 2022-04-25T07:34:52.245Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"3bb55ce4-0ea7-413a-b403-28b174d7ac99\" source=\"source\" unixtime=\"1650872092243\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]" + .getBytes(StandardCharsets.UTF_8) + ) + ); + } + + // Can initialize topic scan with all partitions 
available when the input parameter is 0. Consumer is manually assigned to specific partitions depending on the threadnum parameter. For example on threadnum 1 consumer has odd numbered partitions assigned to it and threadnum 2 has the even numbered partitions. + public static Consumer getConsumer(int threadnum) { + + LOGGER.warn("useMockKafkaConsumer is set, using MockKafkaConsumer"); + int amountofloops = 10; // number of loops for adding partitions/records to the mock consumer topic. Each loop adds a new partition of 14 records. 17777 loops results in file size slightly above 64M. 10 loops is sized at 36,102 bits. + final MockConsumer consumer; + consumer = new MockConsumer<>(OffsetResetStrategy.EARLIEST); + List topicPartitions = new ArrayList<>(); + LinkedHashMap beginningOffsets = new LinkedHashMap<>(); + LinkedHashMap endOffsets = new LinkedHashMap<>(); + List mockPartitionInfo = new ArrayList<>(); + // generate the topic partitions and metadata first + for (int i = 0; i < amountofloops; i++) { + TopicPartition topicPartition = new TopicPartition("testConsumerTopic", i); + topicPartitions.add(topicPartition); + beginningOffsets.put(topicPartition, 0L); + endOffsets.put(topicPartition, 14L); + mockPartitionInfo.add(new PartitionInfo("testConsumerTopic", i, null, null, null)); + } + + if (threadnum == 1) { + List oddTopicPartitions = new ArrayList<>(); + for (TopicPartition a : topicPartitions) { + if (((a.partition() + 1) % 2) == 0) { + oddTopicPartitions.add(a); + } + } + consumer.assign(oddTopicPartitions); // assign + for (TopicPartition a : topicPartitions) { + if (((a.partition() + 1) % 2) == 0) { + generateEvents(consumer, a.topic(), a.partition()); + } + } + } + else if (threadnum == 2) { + List evenTopicPartitions = new ArrayList<>(); + for (TopicPartition a : topicPartitions) { + if (((a.partition() + 1) % 2) != 0) { + evenTopicPartitions.add(a); + } + } + consumer.assign(evenTopicPartitions); // assign + for (TopicPartition a : topicPartitions) { + 
if (((a.partition() + 1) % 2) != 0) { + generateEvents(consumer, a.topic(), a.partition()); + } + } + } + else { + consumer.assign(topicPartitions); // assign + for (TopicPartition a : topicPartitions) { + generateEvents(consumer, a.topic(), a.partition()); + } + } + + consumer.updateBeginningOffsets(beginningOffsets); + + consumer.updateEndOffsets(endOffsets); + consumer.updatePartitions("testConsumerTopic", mockPartitionInfo); + return consumer; + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java new file mode 100644 index 00000000..08fa3f22 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java @@ -0,0 +1,65 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. 
+ * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39.consumers.kafka; + +// Null object design pattern, used to create null offset objects. +public final class NullOffset implements Offset { + + @Override + public boolean isNull() { + return true; + } + + @Override + public byte[] getRecord() { + return new byte[0]; + } + + @Override + public String offsetToJSON() { + return "{\"topic\":\"Not available\", \"partition\":0, \"offset\":0}"; + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java new file mode 100644 index 00000000..ada4c4fd --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java @@ -0,0 +1,55 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
/**
 * Kafka record payload together with a JSON description of where the record came from.
 */
public interface Offset {

    /**
     * @return JSON object text with the keys "topic", "partition" and "offset"
     */
    String offsetToJSON();

    /**
     * @return the record payload as bytes
     */
    byte[] getRecord();

    /**
     * @return true when the implementation is a null object that carries no real record
     */
    boolean isNull();
}
/**
 * Runnable that creates a KafkaReader for one topic and keeps calling read() on it.
 *
 * The Kafka consumer behind the reader is either a real KafkaConsumer subscribed to the topic, or a mock consumer
 * from MockKafkaConsumerFactory when the property "useMockKafkaConsumer" is "true".
 */
public class ReadCoordinator implements Runnable {

    private static final Logger LOGGER = LoggerFactory.getLogger(ReadCoordinator.class);

    // Topic that the real Kafka consumer subscribes to.
    private final String queueTopic;
    // Consumer configuration, also read for the "useMockKafkaConsumer" switch.
    private final Properties readerKafkaProperties;
    // Handed as-is to the KafkaReader.
    private final Consumer> callbackFunction;
    // Loop condition of run(); nothing in this class sets it to false.
    private boolean run = true;
    // Offsets per topic partition that the consumer is moved forward to before reading.
    private final Map hdfsStartOffsets;

    /**
     * @param queueTopic            topic to subscribe to when the real Kafka consumer is used
     * @param readerKafkaProperties Kafka consumer properties, may contain "useMockKafkaConsumer"
     * @param callbackFunction      callback that is passed to the KafkaReader
     * @param hdfsStartOffsets      start offsets keyed by topic partition
     */
    public ReadCoordinator(
            String queueTopic,
            Properties readerKafkaProperties,
            Consumer> callbackFunction,
            Map hdfsStartOffsets
    ) {
        this.queueTopic = queueTopic;
        this.readerKafkaProperties = readerKafkaProperties;
        this.callbackFunction = callbackFunction;
        this.hdfsStartOffsets = hdfsStartOffsets;
    }

    /**
     * Creates the Kafka consumer (mock or real), moves it forward to the HDFS start offsets and wraps it into a
     * KafkaReader.
     */
    private KafkaReader createKafkaReader(
            Properties readerKafkaProperties,
            String topic,
            Consumer> callbackFunction,
            boolean useMockKafkaConsumer
    ) {

        org.apache.kafka.clients.consumer.Consumer kafkaConsumer;
        if (useMockKafkaConsumer) { // Mock kafka consumer is enabled, create mock consumers with assigned partitions that are not overlapping with each other.
            String name = Thread.currentThread().getName(); // Use thread name to identify which thread is running the code.
            if (Objects.equals(name, "testConsumerTopic1")) {
                kafkaConsumer = MockKafkaConsumerFactory.getConsumer(1); // creates a Kafka MockConsumer that has the odd numbered partitions assigned to it.
            }
            else if (Objects.equals(name, "testConsumerTopic2")) {
                kafkaConsumer = MockKafkaConsumerFactory.getConsumer(2); // creates a Kafka MockConsumer that has the even numbered partitions assigned to it.
            }
            else {
                kafkaConsumer = MockKafkaConsumerFactory.getConsumer(0); // Creates a single Kafka MockConsumer that has all the partitions assigned to it.
            }
        }
        else { // Mock kafka consumer is disabled, subscribe method should handle assigning the partitions automatically to the consumer based on group id parameters of readerKafkaProperties.
            kafkaConsumer = new KafkaConsumer<>(
                    readerKafkaProperties,
                    new ByteArrayDeserializer(),
                    new ByteArrayDeserializer()
            );
            kafkaConsumer.subscribe(Collections.singletonList(topic));
        }

        // NOTE(review): after subscribe() the partitions are presumably handed out only during poll(), so the
        // assignment may still be empty here for the real consumer and the loop below would do nothing - confirm.
        Set assignment = kafkaConsumer.assignment();
        // Seek the consumer to topic partition offset defined by the latest record that is committed to HDFS.
        for (TopicPartition topicPartition : assignment) {
            if (hdfsStartOffsets.containsKey(topicPartition)) {
                long position = kafkaConsumer.position(topicPartition);
                // Only ever seek forward, a consumer that is already past the HDFS offset is left alone.
                if (position < hdfsStartOffsets.get(topicPartition)) {
                    kafkaConsumer.seek(topicPartition, hdfsStartOffsets.get(topicPartition));
                }
            }
        }

        return new KafkaReader(kafkaConsumer, callbackFunction);
    }

    // Part of Runnable implementation, called when the thread is started.
    @Override
    public void run() {
        // A missing property means that the real Kafka consumer is used.
        boolean useMockKafkaConsumer = Boolean
                .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false"));
        // The reader is closed by try-with-resources when the loop is left, also when read() throws.
        try (
                KafkaReader kafkaReader = createKafkaReader(
                        readerKafkaProperties, queueTopic, callbackFunction, useMockKafkaConsumer
                )
        ) {
            while (run) {
                kafkaReader.read();
            }
        }
    }
}
+ * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39.consumers.kafka; + +// This is the class for handling the Kafka record topic/partition/offset data that are required for HDFS storage. +public final class RecordOffset implements Offset { + + private final String topic; + private final int partition; + private final long offset; + private final byte[] record; + + public RecordOffset(String topic, int partition, long offset, byte[] record) { + this.topic = topic; + this.partition = partition; + this.offset = offset; + this.record = record; + } + + @Override + public boolean isNull() { + return false; + } + + @Override + public byte[] getRecord() { + return record; + } + + @Override + public String offsetToJSON() { + return String + .format("{\"topic\":\"%s\", \"partition\":%d, \"offset\":%d}", this.topic, this.partition, this.offset); + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java new file mode 100644 index 00000000..3b893d41 --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java @@ -0,0 +1,104 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright 
(C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39.consumers.kafka; + +import com.teragrep.cfe_39.avro.SyslogRecord; +import org.apache.avro.file.CodecFactory; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.file.SeekableFileInput; +import org.apache.avro.file.SyncableFileOutputStream; +import org.apache.avro.io.DatumWriter; +import org.apache.avro.specific.SpecificDatumWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; + +public class SyslogAvroWriter implements AutoCloseable { + + private static final Logger LOGGER = LoggerFactory.getLogger(SyslogAvroWriter.class); + + private final DatumWriter datumWriter = new SpecificDatumWriter<>(SyslogRecord.class); + + private final SyncableFileOutputStream syncableFileOutputStream; + + private final DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter); + + public SyslogAvroWriter(File syslogFile) throws IOException { + dataFileWriter.setCodec(CodecFactory.snappyCodec()); + + syncableFileOutputStream = new SyncableFileOutputStream(syslogFile); + + syncableFileOutputStream.getChannel().tryLock(); + + if (syslogFile.length() == 0) { + // new file + dataFileWriter.create(SyslogRecord.getClassSchema(), syncableFileOutputStream); + } + else { + // existing file + SeekableFileInput seekableFileInput = new SeekableFileInput(syslogFile); + + // seek to end + syncableFileOutputStream.getChannel().position(syncableFileOutputStream.getChannel().size()); + dataFileWriter.appendTo(seekableFileInput, syncableFileOutputStream); + } + } + + public void write(SyslogRecord syslogRecord) throws IOException { + dataFileWriter.append(syslogRecord); + dataFileWriter.flush(); + // getFileSize() doesn't work properly if dataFileWriter.flush() is not called after appending a new record to the AVRO-file. 
+ } + + public void close() throws IOException { + dataFileWriter.close(); + } + + public long getFileSize() throws IOException { + return syncableFileOutputStream.getChannel().size(); + } +} diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java new file mode 100644 index 00000000..4db4116e --- /dev/null +++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java @@ -0,0 +1,144 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. 
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka.queue;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.file.FileVisitOption;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.function.BiPredicate;
+import java.util.function.ToLongFunction;
+import java.util.stream.Stream;
+
+/**
+ * Hands out the next file name of a numbered file queue kept in a single directory.
+ * Queue files are named {@code <queueNamePrefix>.<sequenceNumber>}; files ending in {@code .state} are ignored.
+ */
+public class WritableQueue {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(WritableQueue.class);
+
+    private final Path queueDirectory;
+    private String queueNamePrefix;
+
+    /**
+     * @param queueDirectory  path of the directory that holds the queue files
+     * @param queueNamePrefix file name prefix of the queue files
+     * @throws IllegalArgumentException if the path is not a directory or is not writable
+     */
+    public WritableQueue(String queueDirectory, String queueNamePrefix) {
+        this.queueDirectory = Paths.get(queueDirectory);
+        this.queueNamePrefix = queueNamePrefix;
+        if (!Files.isDirectory(this.queueDirectory)) {
+            throw new IllegalArgumentException("Provided path is not a directory <[" + queueDirectory + "]>");
+        }
+        if (!Files.isWritable(this.queueDirectory)) {
+            throw new IllegalArgumentException("Provided path is not writeable <[" + queueDirectory + "]>");
+        }
+    }
+
+    /**
+     * Scans the queue directory (depth 1) for matching files, takes the highest sequence number found
+     * (0 when there is none) and returns a {@link File} named with that number plus one.
+     * Only the {@link File} object is built here; nothing is created on disk.
+     *
+     * @return file handle for the next sequence number
+     * @throws IOException           if the directory can not be searched
+     * @throws NumberFormatException if a matching file has a non-numeric suffix after its last dot
+     */
+    private File getNextWritableFilename() throws IOException {
+
+        try (
+                Stream<Path> files = Files
+                        .find(queueDirectory, 1, getFileMatcher(queueNamePrefix), FileVisitOption.FOLLOW_LINKS)
+        ) {
+
+            long sequenceNumber = files.mapToLong(getPathToSequenceNumberFunction()).max().orElse(0);
+
+            long nextSequenceNumber = sequenceNumber + 1;
+
+            // build the handle for the next file in sequence
+            return new File(
+                    queueDirectory.toAbsolutePath() + File.separator + queueNamePrefix + "." + nextSequenceNumber
+            );
+        }
+        catch (UncheckedIOException uncheckedIOException) {
+            // just retry, reader modified the directory
+            // NOTE(review): the retry is unbounded recursion; a persistent failure ends in StackOverflowError.
+            return getNextWritableFilename();
+        }
+    }
+
+    /**
+     * Returns the next writable queue file.
+     *
+     * @return file handle for the next sequence number
+     * @throws IOException if no queue name prefix is set (null or empty) or the directory can not be searched
+     */
+    public File getNextWritableFile() throws IOException {
+        // A null prefix is reported the same way as an empty one instead of failing with a NullPointerException.
+        if (queueNamePrefix == null || queueNamePrefix.isEmpty()) {
+            throw new IOException("No queueNamePrefix set");
+        }
+        else {
+            return getNextWritableFilename();
+        }
+    }
+
+    /**
+     * Replaces the file name prefix used for subsequent lookups.
+     *
+     * @param queueNamePrefix new file name prefix
+     */
+    public void setQueueNamePrefix(String queueNamePrefix) {
+        this.queueNamePrefix = queueNamePrefix;
+    }
+
+    /**
+     * Builds the matcher passed to {@link Files#find}: accepts regular files whose name starts with the
+     * given prefix and does not end with {@code .state}.
+     */
+    private BiPredicate<Path, BasicFileAttributes> getFileMatcher(String queueNamePrefix) {
+        return (path, basicFileAttributes) -> {
+            if (!path.getFileName().toString().startsWith(queueNamePrefix)) {
+                return false;
+            }
+            else if (path.getFileName().toString().endsWith(".state")) {
+                return false;
+            }
+            else if (!basicFileAttributes.isRegularFile()) {
+                return false;
+            }
+            else {
+                LOGGER.trace("getFileMatcher returning: <{}>", path);
+                return true;
+            }
+        };
+    }
+
+    /**
+     * Builds the function that parses everything after the last dot of a path as the sequence number.
+     */
+    private ToLongFunction<Path> getPathToSequenceNumberFunction() {
+        return path -> {
+            String pathString = path.toString();
+
+            int dotPosition = pathString.lastIndexOf('.');
+
+            String sequenceNumberString = pathString.substring(dotPosition + 1);
+
+            return Long.parseLong(sequenceNumberString);
+        };
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java b/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java
new file mode 100644
index 00000000..15454eaf
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java
@@ -0,0 +1,146 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the
License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39.metrics;
+
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Meter;
+import com.codahale.metrics.MetricRegistry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Instant;
+
+/**
+ * Collects record, byte and thread totals and derives per-report figures from them.
+ * Totals are fed through the addAndGet* methods; report() folds the change since the previous
+ * report into the reporting counters and log() prints them.
+ */
+public class DurationStatistics {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(DurationStatistics.class);
+
+    // Registry that register() fills with every metric of this object.
+    MetricRegistry metricRegistry = new MetricRegistry();
+
+    // Snapshot of the values seen by the previous report() call.
+    private Instant lastReportTime = Instant.now();
+    private long lastBytes = 0L;
+    private long lastRecords = 0L;
+
+    // Figures maintained by report() and printed by log().
+    private final Counter samplingIntervalStat = new Counter();
+    private final Meter recordsPerSecondStat = new Meter();
+    private final Meter bytesPerSecondStat = new Meter();
+    private final Counter records = new Counter();
+    private final Counter bytes = new Counter();
+
+    // Running totals fed by the addAndGet* methods.
+    private final Meter threadsStat = new Meter();
+    private final Meter bytesStat = new Meter();
+    private final Meter recordsStat = new Meter();
+
+    /**
+     * Registers every metric of this object under its field name.
+     *
+     * @return the registry holding the metrics
+     */
+    public MetricRegistry register() {
+        metricRegistry.register("samplingIntervalStat", samplingIntervalStat);
+        metricRegistry.register("recordsPerSecondStat", recordsPerSecondStat);
+        metricRegistry.register("bytesPerSecondStat", bytesPerSecondStat);
+        metricRegistry.register("records", records);
+        metricRegistry.register("bytes", bytes);
+        metricRegistry.register("threadsStat", threadsStat);
+        metricRegistry.register("bytesStat", bytesStat);
+        metricRegistry.register("recordsStat", recordsStat);
+        return metricRegistry;
+    }
+
+    /**
+     * Folds the change in record and byte totals since the previous call into the reporting metrics
+     * and stores the current totals and time as the new baseline.
+     */
+    public void report() {
+        // Adding zero reads the running totals without changing them.
+        final long totalRecords = addAndGetRecords(0);
+        final long totalBytes = addAndGetBytes(0);
+        final long recordsDelta = totalRecords - lastRecords;
+        final long bytesDelta = totalBytes - lastBytes;
+
+        if (totalRecords > lastRecords) {
+            // new records arrived: add them to the record counter
+            records.inc(recordsDelta);
+        }
+        else {
+            // nothing new: bring the record counter back to zero
+            records.dec(records.getCount());
+        }
+
+        if (totalBytes > lastBytes) {
+            // new bytes arrived: add them to the byte counter
+            bytes.inc(bytesDelta);
+        }
+        else {
+            // nothing new: bring the byte counter back to zero
+            bytes.dec(bytes.getCount());
+        }
+
+        final Instant now = Instant.now();
+        final long elapsedMillis = now.toEpochMilli() - lastReportTime.toEpochMilli();
+        samplingIntervalStat.inc(elapsedMillis);
+
+        recordsPerSecondStat.mark(recordsDelta);
+        bytesPerSecondStat.mark(bytesDelta);
+
+        // remember this report as the baseline for the next one
+        lastReportTime = now;
+        lastRecords = totalRecords;
+        lastBytes = totalBytes;
+    }
+
+    /**
+     * @return current value of the record counter maintained by report()
+     */
+    public long getTotalRecords() {
+        return records.getCount();
+    }
+
+    /**
+     * Logs the reporting counters and one-minute rates at info level, then resets the accumulated
+     * sampling interval to zero.
+     */
+    public void log() {
+        final long recordCount = records.getCount();
+        final long kiloBytes = bytes.getCount() / 1024;
+        final long intervalMillis = samplingIntervalStat.getCount();
+        final double recordsRate = recordsPerSecondStat.getOneMinuteRate();
+        final double kiloBytesRate = bytesPerSecondStat.getOneMinuteRate() / 1024;
+        LOGGER
+                .info(
+                        "## Processed records <{}> and size <{}> KB during <{}> ms / Metrics for the preceding minute: <{}> RPS. <{}> KB/s ",
+                        recordCount, kiloBytes, intervalMillis, recordsRate, kiloBytesRate
+                );
+        samplingIntervalStat.dec(samplingIntervalStat.getCount());
+    }
+
+    /**
+     * @param delta amount to add to the thread total
+     * @return thread total after the addition
+     */
+    public long addAndGetThreads(long delta) {
+        threadsStat.mark(delta);
+        return threadsStat.getCount();
+    }
+
+    /**
+     * @param delta amount to add to the byte total
+     * @return byte total after the addition
+     */
+    public long addAndGetBytes(long delta) {
+        bytesStat.mark(delta);
+        return bytesStat.getCount();
+    }
+
+    /**
+     * @param delta amount to add to the record total
+     * @return record total after the addition
+     */
+    public long addAndGetRecords(long delta) {
+        recordsStat.mark(delta);
+        return recordsStat.getCount();
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/metrics/topic/TopicCounter.java b/src/main/java/com/teragrep/cfe_39/metrics/topic/TopicCounter.java
new file mode 100644
index 00000000..cd08a71c
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/metrics/topic/TopicCounter.java
@@ -0,0 +1,115 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39.metrics.topic;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Holder of the metric values of one topic. Every value lives in its own {@link AtomicLong}
+ * and starts at zero.
+ */
+public class TopicCounter {
+
+    private final String topicName;
+    private final AtomicLong totalRecords = new AtomicLong(0L);
+    private final AtomicLong totalBytes = new AtomicLong(0L);
+    private final AtomicLong recordsPerSecond = new AtomicLong(0L);
+    private final AtomicLong bytesPerSecond = new AtomicLong(0L);
+    private final AtomicLong kafkaLatency = new AtomicLong(0L);
+    private final AtomicLong databaseLatency = new AtomicLong(0L);
+
+    /**
+     * @param topicName name of the topic the values belong to
+     */
+    public TopicCounter(String topicName) {
+        this.topicName = topicName;
+    }
+
+    /** @return the topic name given at construction */
+    public String getTopicName() {
+        return topicName;
+    }
+
+    /** @return sum of all amounts passed to {@link #addToTotalRecords(long)} */
+    public long getTotalRecords() {
+        return totalRecords.get();
+    }
+
+    /** @param incrementBy amount added to the record total */
+    public void addToTotalRecords(long incrementBy) {
+        totalRecords.getAndAdd(incrementBy);
+    }
+
+    /** @return sum of all amounts passed to {@link #addToTotalBytes(long)} */
+    public long getTotalBytes() {
+        return totalBytes.get();
+    }
+
+    /** @param incrementBy amount added to the byte total */
+    public void addToTotalBytes(long incrementBy) {
+        totalBytes.getAndAdd(incrementBy);
+    }
+
+    /** @return the value last stored with {@link #setRecordsPerSecond(long)} */
+    public long getRecordsPerSecond() {
+        return recordsPerSecond.get();
+    }
+
+    /** @param rps value that replaces the stored records-per-second figure */
+    public void setRecordsPerSecond(long rps) {
+        recordsPerSecond.set(rps);
+    }
+
+    /** @return the value last stored with {@link #setBytesPerSecond(long)} */
+    public long getBytesPerSecond() {
+        return bytesPerSecond.get();
+    }
+
+    /** @param bps value that replaces the stored bytes-per-second figure */
+    public void setBytesPerSecond(long bps) {
+        bytesPerSecond.set(bps);
+    }
+
+    /** @return the value last stored with {@link #setKafkaLatency(long)} */
+    public long getKafkaLatency() {
+        return kafkaLatency.get();
+    }
+
+    /** @param latency value that replaces the stored Kafka latency */
+    public void setKafkaLatency(long latency) {
+        kafkaLatency.set(latency);
+    }
+
+    /** @return the value last stored with {@link #setDatabaseLatency(long)} */
+    public long getDatabaseLatency() {
+        return databaseLatency.get();
+    }
+
+    /** @param latency value that replaces the stored database latency */
+    public void setDatabaseLatency(long latency) {
+        databaseLatency.set(latency);
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/ConfigTest.java b/src/test/java/com/teragrep/cfe_39/ConfigTest.java
new file mode 100644
index 00000000..7fc13bf0
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/ConfigTest.java
@@ -0,0 +1,88 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software:
you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Properties;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Tests constructing Config from the file named by the cfe_39.config.location system property.
+ */
+public class ConfigTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(ConfigTest.class);
+
+    /**
+     * A Config built from valid.application.properties must expose Kafka consumer properties in which
+     * useMockKafkaConsumer parses to true.
+     */
+    @Test
+    public void validConfigTest() {
+        assertDoesNotThrow(() -> {
+            // Point the configuration lookup at the valid properties file.
+            final String validConfigLocation = System.getProperty("user.dir")
+                    + "/src/test/resources/valid.application.properties";
+            System.setProperty("cfe_39.config.location", validConfigLocation);
+
+            final Properties kafkaConsumerProperties = new Config().getKafkaConsumerProperties();
+
+            // A missing key falls back to "false", which fails the assertion below.
+            final String rawFlag = kafkaConsumerProperties.getProperty("useMockKafkaConsumer", "false");
+            final boolean useMockKafkaConsumer = Boolean.parseBoolean(rawFlag);
+            Assertions.assertTrue(useMockKafkaConsumer);
+            LOGGER.debug("useMockKafkaConsumer: {}", useMockKafkaConsumer);
+        });
+    }
+
+    /**
+     * Constructing a Config from broken.application.properties must fail with the message
+     * "hdfsuri not set".
+     */
+    @Test
+    public void brokenConfigTest() {
+        // Point the configuration lookup at the broken properties file.
+        final String brokenConfigLocation = System.getProperty("user.dir")
+                + "/src/test/resources/broken.application.properties";
+        System.setProperty("cfe_39.config.location", brokenConfigLocation);
+
+        final Exception thrown = Assertions.assertThrows(Exception.class, () -> new Config());
+        Assertions.assertEquals("hdfsuri not set", thrown.getMessage());
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/HdfsTest.java b/src/test/java/com/teragrep/cfe_39/HdfsTest.java
new file mode 100644
index 00000000..315e6dac
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/HdfsTest.java
@@ -0,0 +1,191 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import com.teragrep.cfe_39.consumers.kafka.HDFSWrite;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+// Tests the functionality of the HDFSWrite.java.
+public class HdfsTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(HdfsTest.class);
+
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    // Start minicluster and initialize config.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+        });
+    }
+
+    // Teardown the minicluster
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            assertDoesNotThrow(() -> {
+                fs.close();
+            });
+        }
+        finally {
+            // Shut the cluster down and remove its directory even when closing the file system fails,
+            // so a failing close does not leave a running cluster behind for the next test.
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
+    @Test
+    public void hdfsWriteTest() {
+        // This test case is for testing the functionality of the HDFSWrite.java by writing pre-generated AVRO-files to the HDFS database and asserting the results are correct.
+        assertDoesNotThrow(() -> {
+            Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+
+            // writer.commit will delete the file that is given as an input argument. Copy the mock files to another directory so the deletion can be asserted properly too.
+            String pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.9";
+            java.nio.file.Path sourceFile = Paths.get(pathname);
+            java.nio.file.Path targetDir = Paths.get(config.getQueueDirectory());
+            java.nio.file.Path targetFile = targetDir.resolve(sourceFile.getFileName());
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Files.copy(sourceFile, targetFile);
+            Assertions.assertTrue(targetFile.toFile().exists());
+            File avroFile = new File(targetFile.toUri());
+            JsonObject recordOffsetJo = JsonParser
+                    .parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
+                    .getAsJsonObject();
+            try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
+                writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
+            }
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Assertions
+                    .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+
+            pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.13";
+            sourceFile = Paths.get(pathname);
+            targetDir = Paths.get(config.getQueueDirectory());
+            targetFile = targetDir.resolve(sourceFile.getFileName());
+            Files.copy(sourceFile, targetFile);
+            Assertions.assertTrue(targetFile.toFile().exists());
+            avroFile = new File(config.getQueueDirectory() + "/0.13");
+            recordOffsetJo = JsonParser
+                    .parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":13}")
+                    .getAsJsonObject();
+            try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
+                writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
+            }
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Assertions
+                    .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+        });
+    }
+
+    @Test
+    public void hdfsWriteExceptionTest() {
+        // This test case is for testing the functionality of the HDFSWrite.java exception handling by trying to write the same file twice and asserting that the proper exception is thrown.
+        assertDoesNotThrow(() -> {
+            Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+
+            // writer.commit will delete the source file that is given as an input argument. Copy the mock file to another directory so the deletion of the source file can be asserted properly.
+            String pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.9";
+            java.nio.file.Path sourceFile = Paths.get(pathname);
+            java.nio.file.Path targetDir = Paths.get(config.getQueueDirectory());
+            java.nio.file.Path targetFile = targetDir.resolve(sourceFile.getFileName());
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Files.copy(sourceFile, targetFile);
+
+            Assertions.assertTrue(targetFile.toFile().exists());
+            File avroFile = new File(targetFile.toUri());
+            JsonObject recordOffsetJo = JsonParser
+                    .parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
+                    .getAsJsonObject();
+            try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
+                writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
+            }
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Assertions
+                    .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+
+            // Second attempt: commit the same partition/offset again and expect it to be rejected.
+            Files.copy(sourceFile, targetFile);
+            Assertions.assertTrue(targetFile.toFile().exists());
+            File duplicateAvroFile = new File(config.getQueueDirectory() + "/0.9");
+            JsonObject duplicateOffsetJo = JsonParser
+                    .parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
+                    .getAsJsonObject();
+            // try-with-resources closes the writer even when one of the assertions inside fails.
+            try (HDFSWrite duplicateWriter = new HDFSWrite(config, duplicateOffsetJo)) {
+                Exception e = Assertions
+                        .assertThrows(Exception.class, () -> duplicateWriter.commit(duplicateAvroFile));
+                Assertions.assertEquals("File 0.9 already exists", e.getMessage());
+            }
+            // Checked only after the writer is closed, as before.
+            Assertions.assertFalse(targetFile.toFile().exists());
+            Assertions
+                    .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+        });
+    }
+}
diff --git
a/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java new file mode 100644 index 00000000..cf2fe882 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java @@ -0,0 +1,372 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. 
+ * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.avro.SyslogRecord; +import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.specific.SpecificDatumReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.condition.DisabledIfSystemProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.net.URI; +import java.nio.file.Files; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class Ingestion0FilesTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion0FilesTest.class); + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; + + // Prepares known state for testing. + @BeforeEach + public void startMiniCluster() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration. 
+ System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"); + config = new Config(); + // Create a HDFS miniCluster + baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile(); + hdfsCluster = new TestMiniClusterFactory().create(config, baseDir); + fs = new TestFileSystemFactory().create(config.getHdfsuri()); + }); + } + + // Teardown the minicluster + @AfterEach + public void teardownMiniCluster() { + assertDoesNotThrow(() -> { + fs.close(); + }); + hdfsCluster.shutdown(); + FileUtil.fullyDelete(baseDir); + } + + @DisabledIfSystemProperty( + named = "skipIngestionTest", + matches = "true" + ) + @Test + public void ingestion0FilesTest() { + /*This test case is for testing the functionality of the ingestion when there are no files already present in the database before starting ingestion. + Maximum file size is set to 30,000 in the config. + Empty HDFS database, 140 records in mock kafka consumer ready for ingestion. All 14 records for each 10 topic partitions are stored in a single avro-file per partition.*/ + assertDoesNotThrow(() -> { + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct. + config.setMaximumFileSize(30000); // This parameter defines the amount of records that can fit inside a single AVRO-file. + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic"))); + HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config); + Thread.sleep(10000); + hdfsDataIngestion.run(); + }); + + // Assert that the kafka records were ingested correctly and the database holds the correct 140 records. 
+ assertDoesNotThrow(() -> { + String path = config.getHdfsPath() + "/" + "testConsumerTopic"; + Path newDirectoryPath = new Path(path); + Assertions.assertTrue(fs.exists(newDirectoryPath)); + + /* This is the HDFS write path for the files: + Path hdfswritepath = new Path(newDirectoryPath + "/" + fileName); where newDirectoryPath is config.getHdfsPath() + "/" + lastObject.topic; and filename is lastObject.partition+"."+lastObject.offset; + + Create the list of files to read from HDFS. Test setup is created so each of the 0-9 partitions will have 1 file with offset of 13.*/ + List filenameList = new ArrayList<>(); + for (int i = 0; i <= 9; i++) { + filenameList.add(i + "." + 13); + } + FileStatus[] fileStatuses = fs.listStatus(newDirectoryPath); + Assertions.assertEquals(filenameList.size(), fileStatuses.length); + for (FileStatus fileStatus : fileStatuses) { + Assertions.assertTrue(filenameList.contains(fileStatus.getPath().getName())); + } + LOGGER.debug("All expected files present in HDFS."); + + int partitionCounter = 0; + for (String fileName : filenameList) { + //==== Read files + LOGGER.info("Read file into hdfs"); + //Create a path + Path hdfsreadpath = new Path(newDirectoryPath + "/" + fileName); // The path should be the same that was used in writing the file to HDFS. + //Init input stream + FSDataInputStream inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string. 
+ DataFileStream reader = new DataFileStream<>( + inputStream, + new SpecificDatumReader<>(SyslogRecord.class) + ); + SyslogRecord record = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 0, \"origin\": \"jla-02.default\", \"payload\": \"[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090806000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 1, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 2, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": 
\"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 3, \"origin\": \"jla-02\", \"payload\": \"470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 4, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092238000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 5, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 6, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]\"}", + 
record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 7, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 8, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 9, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + 
.assertEquals( + "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 10, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 11, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092242000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 12, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092243000, \"directory\": \"jla02logger\", \"stream\": 
\"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 13, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]\"}", + record.toString() + ); + Assertions.assertFalse(reader.hasNext()); + LOGGER.info("Partition {} passed assertions.", partitionCounter); + partitionCounter++; + inputStream.close(); + } + Assertions.assertEquals(10, partitionCounter); + }); + } + + @DisabledIfSystemProperty( + named = "skipIngestionTest", + matches = "true" + ) + @Test + public void ingestion0FilesLowSizeTest() { + /*This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion. + Maximum file size is set to 3,000 in the config. + Empty HDFS database, 140 records in mock kafka consumer ready for ingestion. All 14 records for each 10 topic partitions are stored in two avro-files per partition based on MaximumFileSize.*/ + assertDoesNotThrow(() -> { + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct. + config.setMaximumFileSize(3000); // This parameter defines the amount of records that can fit inside a single AVRO-file. + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic"))); + HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config); + Thread.sleep(10000); + hdfsDataIngestion.run(); + }); + + // Assert that the kafka records were ingested correctly and the database holds the correct 140 records. + + // Check that the files were properly written to HDFS. 
+ String hdfsuri = config.getHdfsuri(); + + String path = config.getHdfsPath() + "/" + "testConsumerTopic"; + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // Set HADOOP user + System.setProperty("HADOOP_USER_NAME", "hdfs"); + System.setProperty("hadoop.home.dir", "/"); + //Get the filesystem - HDFS + assertDoesNotThrow(() -> { + fs = FileSystem.get(URI.create(hdfsuri), conf); + + Path workingDir = fs.getWorkingDirectory(); + Path newDirectoryPath = new Path(path); + Assertions.assertTrue(fs.exists(newDirectoryPath)); + + // Assert that the kafka records were ingested correctly and the database holds the expected 20 files. + Assertions + .assertEquals(20, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + 
"testConsumerTopic" + "/" + "4.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13"))); + LOGGER.debug("All expected files present in HDFS."); + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java new file mode 100644 index 00000000..79174e47 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java @@ -0,0 +1,175 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledIfSystemProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.file.Files; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +
+/**
+ * Ingestion test where the topic directory already holds two pre-made files of partition 0 before
+ * the ingestion starts: 0.9 carrying an old modification time and 0.13 carrying a new one.
+ */
+public class Ingestion1Old1NewFileTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion1Old1NewFileTest.class);
+    // Modification time (epoch milliseconds) stamped on file 0.9; the test asserts it is older than the prune threshold.
+    private static final long OLD_FILE_MODIFICATION_TIME = 157784760000L;
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    /**
+     * Prepares the known starting state: loads the valid test configuration, starts the HDFS
+     * mini cluster, copies every regular file of src/test/resources/mockHdfsFiles into the topic
+     * directory and gives file 0.9 an old modification time.
+     */
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty(
+                            "cfe_39.config.location",
+                            System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"
+                    );
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+
+            // Inserts pre-made avro-files to HDFS where one file has new timestamp and other old, which are normally generated during data ingestion from mock kafka consumer.
+            String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // e.g. "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+            // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+            Path newDirectoryPath = new Path(path);
+            // Create new Directory
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.debug("Path {} created.", path);
+            String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles";
+            // Typed as Set<String>: the loop below iterates the names as String, which a raw Set does not allow.
+            Set<String> listOfFiles = Stream
+                    .of(Objects.requireNonNull(new File(dir).listFiles()))
+                    .filter(file -> !file.isDirectory())
+                    .map(File::getName)
+                    .collect(Collectors.toSet());
+            // Loop through all the avro files
+            for (String fileName : listOfFiles) {
+                String pathname = dir + "/" + fileName;
+                File avroFile = new File(pathname);
+                //==== Write file
+                LOGGER.debug("Begin Write file into hdfs");
+                // Create a path. The filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+                Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName());
+                Assertions.assertFalse(fs.exists(hdfswritepath));
+                Path readPath = new Path(avroFile.getPath());
+                fs.copyFromLocalFile(readPath, hdfswritepath);
+                LOGGER.debug("End Write file into hdfs");
+                LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath);
+            }
+            // Build the paths from the configured directory the files were just copied to, so the
+            // timestamps always land on those files instead of on a hard-coded location.
+            fs.setTimes(new Path(newDirectoryPath + "/" + "0.9"), OLD_FILE_MODIFICATION_TIME, -1);
+            // -1 means "do not set": 0.13 keeps the modification time it got when it was copied in.
+            fs.setTimes(new Path(newDirectoryPath + "/" + "0.13"), -1, -1);
+        });
+    }
+
+    /**
+     * Tears down the mini cluster. The cluster is shut down and the temporary directory removed
+     * even when closing the file system fails or when the setup did not get far enough to create them.
+     */
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(() -> {
+                    fs.close();
+                });
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+    /**
+     * Verifies that the ingestion prunes the old file 0.9, keeps the new file 0.13 and leaves the
+     * topic directory with exactly one file, {@code <partition>.13}, for each of the partitions 0-9.
+     */
+    @DisabledIfSystemProperty(
+            named = "skipIngestionTest",
+            matches = "true"
+    )
+    @Test
+    public void ingestion1Old1NewFileTest() {
+        /* This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion.
+         Partition 0 files (0.9 and 0.13) are pre-made and inserted to the HDFS database before starting ingestion, with old timestamp for file 0.9 and new for 0.13.
+         The rest of the 140 records are ingested from the mock kafka consumer, partitions 1 to 9 will have only a single file each.
+         Old files are pruned from the database during ingestion topic scan loops.*/
+
+        assertDoesNotThrow(() -> {
+            final String topicPath = config.getHdfsPath() + "/" + "testConsumerTopic";
+
+            // Assert the known starting state.
+            Assertions.assertTrue(fs.exists(new Path(topicPath)));
+            Assertions.assertEquals(2, fs.listStatus(new Path(topicPath)).length);
+            Assertions.assertTrue(fs.exists(new Path(topicPath + "/" + "0.9")));
+            Assertions.assertTrue(fs.exists(new Path(topicPath + "/" + "0.13")));
+            Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
+            // The timestamp given to 0.9 must be older than (now - prune offset).
+            Assertions.assertTrue((System.currentTimeMillis() - config.getPruneOffset()) > OLD_FILE_MODIFICATION_TIME);
+            config.setMaximumFileSize(30000);
+            HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
+            Thread.sleep(10000);
+            hdfsDataIngestion.run();
+
+            // Assert that the kafka records were ingested and pruned correctly and the database holds only the expected 10 files.
+            Assertions.assertEquals(10, fs.listStatus(new Path(topicPath)).length);
+            Assertions.assertFalse(fs.exists(new Path(topicPath + "/" + "0.9")), "Old file 0.9 was not pruned");
+            for (int partition = 0; partition <= 9; partition++) {
+                Path expectedFile = new Path(topicPath + "/" + partition + ".13");
+                Assertions.assertTrue(fs.exists(expectedFile), "Missing HDFS file " + expectedFile);
+            }
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java new file mode 100644 index 00000000..7c8e7db8 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java @@ -0,0 +1,596 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program.
+ * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.avro.SyslogRecord; +import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.specific.SpecificDatumReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledIfSystemProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.net.URI; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class Ingestion2NewFilesTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion2NewFilesTest.class); + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; +
+    /**
+     * Prepares the known starting state: loads the valid test configuration, starts the HDFS
+     * mini cluster and copies every regular file of src/test/resources/mockHdfsFiles into the
+     * topic directory. The modification times of the copied files are left as they are.
+     */
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty(
+                            "cfe_39.config.location",
+                            System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"
+                    );
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+
+            // Inserts pre-made avro-files with new timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer.
+            String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // e.g. "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+            // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+            Path newDirectoryPath = new Path(path);
+            // Create new Directory
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.debug("Path {} created.", path);
+            String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles";
+            // Typed as Set<String>: the loop below iterates the names as String, which a raw Set does not allow.
+            Set<String> listOfFiles = Stream
+                    .of(Objects.requireNonNull(new File(dir).listFiles()))
+                    .filter(file -> !file.isDirectory())
+                    .map(File::getName)
+                    .collect(Collectors.toSet());
+            // Loop through all the avro files
+            for (String fileName : listOfFiles) {
+                String pathname = dir + "/" + fileName;
+                File avroFile = new File(pathname);
+                //==== Write file
+                LOGGER.debug("Begin Write file into hdfs");
+                // Create a path. The filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+                Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName());
+                Assertions.assertFalse(fs.exists(hdfswritepath));
+                Path readPath = new Path(avroFile.getPath());
+                fs.copyFromLocalFile(readPath, hdfswritepath);
+                LOGGER.debug("End Write file into hdfs");
+                LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath);
+            }
+        });
+    }
+
+    /**
+     * Tears down the mini cluster. The cluster is shut down and the temporary directory removed
+     * even when closing the file system fails or when the setup did not get far enough to create them.
+     */
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(() -> {
+                    fs.close();
+                });
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+ @DisabledIfSystemProperty( + named = "skipIngestionTest", + matches = "true" + ) + @Test + public void ingestion2NewFilesTest() { + /* This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion. + 14 records are inserted to HDFS database before starting ingestion, with 124/140 records in mock kafka consumer ready for ingestion. + Partitions through 1 to 9 will have only a single file, partition 0 will have 2 files (0.9 and 0.13) that are inserted to the database before starting ingestion. + */ + assertDoesNotThrow(() -> { + // Assert the known starting state. + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic"))); + Assertions + .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct. + config.setMaximumFileSize(30000); + HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config); + Thread.sleep(10000); + hdfsDataIngestion.run(); + + // Assert that the kafka records were ingested correctly and the database holds the expected 11 files holding the expected 140 records.
+ Assertions + .assertEquals(11, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13"))); + }); + + // Check that the files were properly written to HDFS. 
+ String hdfsuri = config.getHdfsuri(); + + String path = config.getHdfsPath() + "/" + "testConsumerTopic"; + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // Set HADOOP user + System.setProperty("HADOOP_USER_NAME", "hdfs"); + System.setProperty("hadoop.home.dir", "/"); + //Get the filesystem - HDFS + assertDoesNotThrow(() -> { + fs = FileSystem.get(URI.create(hdfsuri), conf); + + Path workingDir = fs.getWorkingDirectory(); + Path newDirectoryPath = new Path(path); + Assertions.assertTrue(fs.exists(newDirectoryPath)); + + /* This is the HDFS write path for the files: + Path hdfswritepath = new Path(newDirectoryPath + "/" + fileName); where newDirectoryPath is config.getHdfsPath() + "/" + lastObject.topic; and filename is lastObject.partition+"."+lastObject.offset; + + Create the list of files to read from HDFS. Test setup is created so each of the 1-9 partitions will have 1 file with offset of 13, while the 0th partition will have 2 files with offset 9 and 13.*/ + List filenameList = new ArrayList<>(); + filenameList.add("0.9"); + filenameList.add("0.13"); + for (int i = 1; i <= 9; i++) { + filenameList.add(i + "." + 13); + } + FileStatus[] fileStatuses = fs.listStatus(newDirectoryPath); + Assertions.assertEquals(filenameList.size(), fileStatuses.length); + for (FileStatus fileStatus : fileStatuses) { + Assertions.assertTrue(filenameList.contains(fileStatus.getPath().getName())); + } + LOGGER.info("All expected files present in HDFS."); + + int partitionCounter = 0; + + // Assertions for file testConsumerTopic/0.9 + String fileName0 = filenameList.get(0); + Assertions.assertEquals("0.9", fileName0); + // Assert that file testConsumerTopic/0.9 has expected content. 
+ LOGGER.info("Read file into hdfs"); + //Create a path + Path hdfsreadpath = new Path(newDirectoryPath + "/" + fileName0); // The path should be the same that was used in writing the file to HDFS. + //Init input stream + FSDataInputStream inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string. + DataFileStream reader = new DataFileStream<>( + inputStream, + new SpecificDatumReader<>(SyslogRecord.class) + ); + SyslogRecord record = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 0, \"origin\": \"jla-02.default\", \"payload\": \"[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872090806000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 1, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": 
\"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 2, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 3, \"origin\": \"jla-02\", \"payload\": \"470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 4, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092238000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 5, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.238 [INFO] 
com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 6, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 7, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + 
"\", \"offset\": 8, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 9, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]\"}", + record.toString() + ); + + Assertions.assertFalse(reader.hasNext()); // Reached the end of the testConsumerTopic/0.9 file. + inputStream.close(); + filenameList.remove(0); + + // Assertions for file testConsumerTopic/0.13 + fileName0 = filenameList.get(0); + Assertions.assertEquals("0.13", fileName0); + LOGGER.info("Read file into hdfs"); + //Create a path + hdfsreadpath = new Path(newDirectoryPath + "/" + fileName0); // The path should be the same that was used in writing the file to HDFS. + //Init input stream + inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string. 
+ reader = new DataFileStream<>(inputStream, new SpecificDatumReader<>(SyslogRecord.class)); + record = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 10, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 11, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092242000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 
12, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(record.toString()); + } + Assertions + .assertEquals( + "{\"timestamp\": 1650872092243000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 13, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]\"}", + record.toString() + ); + Assertions.assertFalse(reader.hasNext()); // Reached the end of the testConsumerTopic/0.13 file. + inputStream.close(); + filenameList.remove(0); + + partitionCounter++; + + for (String fileName : filenameList) { + //==== Read files + LOGGER.info("Read file into hdfs"); + //Create a path + hdfsreadpath = new Path(newDirectoryPath + "/" + fileName); // The path should be the same that was used in writing the file to HDFS. + //Init input stream + inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string. 
+ reader = new DataFileStream<>(inputStream, new SpecificDatumReader<>(SyslogRecord.class)); + record = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 0, \"origin\": \"jla-02.default\", \"payload\": \"[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090806000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 1, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 2, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", 
\"partition\": \"" + + partitionCounter + + "\", \"offset\": 3, \"origin\": \"jla-02\", \"payload\": \"470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 4, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092238000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 5, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 6, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]\"}", + record.toString() + ); + + 
Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 7, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 8, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 9, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + 
"{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 10, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 11, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092242000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 12, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]\"}", + record.toString() + ); + + Assertions.assertTrue(reader.hasNext()); + record = reader.next(record); + Assertions + .assertEquals( + "{\"timestamp\": 1650872092243000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", 
\"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"" + + partitionCounter + + "\", \"offset\": 13, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]\"}", + record.toString() + ); + Assertions.assertFalse(reader.hasNext()); + LOGGER.info("Partition {} passed assertions.", partitionCounter); + partitionCounter++; + inputStream.close(); + } + Assertions.assertEquals(10, partitionCounter); + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java new file mode 100644 index 00000000..4424918b --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java @@ -0,0 +1,176 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledIfSystemProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.file.Files; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class Ingestion2OldFilesTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion2OldFilesTest.class); + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; + + // Prepares known state for testing. + @BeforeEach + public void startMiniCluster() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration. + System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"); + config = new Config(); + // Create a HDFS miniCluster + baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile(); + hdfsCluster = new TestMiniClusterFactory().create(config, baseDir); + fs = new TestFileSystemFactory().create(config.getHdfsuri()); + + // Inserts pre-made avro-files with old timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer. 
+ String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic" + // Sets the directory where the data should be stored, if the directory doesn't exist then it's created. + Path newDirectoryPath = new Path(path); + // Create new Directory + fs.mkdirs(newDirectoryPath); + LOGGER.debug("Path {} created.", path); + + String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles"; + Set listOfFiles = Stream + .of(Objects.requireNonNull(new File(dir).listFiles())) + .filter(file -> !file.isDirectory()) + .map(File::getName) + .collect(Collectors.toSet()); + // Loop through all the avro files + for (String fileName : listOfFiles) { + String pathname = dir + "/" + fileName; + File avroFile = new File(pathname); + //==== Write file + LOGGER.debug("Begin Write file into hdfs"); + //Create a path + Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName()); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset. 
+ Assertions.assertFalse(fs.exists(hdfswritepath)); + Path readPath = new Path(avroFile.getPath()); + fs.copyFromLocalFile(readPath, hdfswritepath); + LOGGER.debug("End Write file into hdfs"); + LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath); + } + fs.setTimes(new Path("hdfs:/opt/teragrep/cfe_39/srv/testConsumerTopic/0.9"), 157784760000L, -1); + fs.setTimes(new Path("hdfs:/opt/teragrep/cfe_39/srv/testConsumerTopic/0.13"), 157784760000L, -1); + }); + } + + // Teardown the minicluster + @AfterEach + public void teardownMiniCluster() { + assertDoesNotThrow(() -> { + fs.close(); + }); + hdfsCluster.shutdown(); + FileUtil.fullyDelete(baseDir); + } + + @DisabledIfSystemProperty( + named = "skipIngestionTest", + matches = "true" + ) + @Test + public void ingestion2OldFilesTest() { + /* This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion. + 14 records are inserted to HDFS database before starting ingestion, with 126/140 records in mock kafka consumer ready for ingestion. + Partitions through 1 to 9 will have only a single file, partition 0 will have 2 files (0.9 and 0.13) that are inserted to the database before starting ingestion. + partition 0 files are pre-made and inserted to the HDFS database with old timestamps that will mark them for pruning when ingestion is started.*/ + + assertDoesNotThrow(() -> { + // Assert the known starting state. 
+ Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic"))); + Assertions + .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct. + Assertions.assertTrue((System.currentTimeMillis() - config.getPruneOffset()) > 157784760000L); + config.setMaximumFileSize(30000); + HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config); + Thread.sleep(10000); + hdfsDataIngestion.run(); + + // Assert that the kafka records were ingested and pruned correctly and the database holds only the expected 9 files. + Assertions + .assertEquals(9, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions + .assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + 
"testConsumerTopic" + "/" + "7.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13"))); + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java b/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java new file mode 100644 index 00000000..0087786a --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java @@ -0,0 +1,651 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. 
+ * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.ReadCoordinator; +import com.teragrep.cfe_39.consumers.kafka.RecordOffset; +import com.teragrep.rlo_06.ParseException; +import com.teragrep.rlo_06.RFC5424Frame; +import org.apache.kafka.common.TopicPartition; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.util.*; +import java.util.function.Consumer; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class KafkaConsumerTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(KafkaConsumerTest.class); + + @Test + public void readCoordinatorTest2Threads() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration. 
+ System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"); + Config config = new Config(); + Map hdfsStartOffsets = new HashMap<>(); + ArrayList> messages = new ArrayList<>(); + Consumer> output = message -> messages.add(message); + + ReadCoordinator readCoordinator = new ReadCoordinator( + "testConsumerTopic", + config.getKafkaConsumerProperties(), + output, + hdfsStartOffsets + ); + Thread readThread = new Thread(null, readCoordinator, "testConsumerTopic1"); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic. + readThread.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator. + + Thread.sleep(1000); + + ReadCoordinator readCoordinator2 = new ReadCoordinator( + "testConsumerTopic", + config.getKafkaConsumerProperties(), + output, + hdfsStartOffsets + ); + Thread readThread2 = new Thread(null, readCoordinator2, "testConsumerTopic2"); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic. + readThread2.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator. + + Thread.sleep(10000); + Assertions.assertEquals(2, messages.size()); + Assertions.assertEquals(160, messages.get(0).size() + messages.get(1).size()); // Assert that expected amount of records has been consumed by the consumer group. + Assertions.assertEquals(80, messages.get(0).size()); + Assertions.assertEquals(80, messages.get(1).size()); + + // Assert that all the record contents are correct, every topic partition has identical set of offset-message pairings. 
+ List messageList = new ArrayList(); + messageList.add("[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!"); + messageList.add("[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"); + messageList.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi."); + messageList.add("470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi."); + messageList.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi."); + messageList + .add( + "25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.241 [ERROR] 
com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]" + ); + messageList + .add( + "25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]" + ); + + RFC5424Frame rfc5424Frame = new RFC5424Frame(false); + + RecordOffset recordOffset; + + Iterator iterator = messageList.iterator(); + int counter = 0; + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + ParseException e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; 
+ + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + 
Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + 
rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(0).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + Assertions.assertEquals(80, counter); + + counter = 0; + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + 
15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), 
rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + 
counter++; + + iterator = messageList.iterator(); + for (int i = 0; i <= 13; i++) { + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + i + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + Assertions.assertTrue(rfc5424Frame.next()); + Assertions.assertTrue(iterator.hasNext()); + Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString()); + Assertions.assertFalse(rfc5424Frame.next()); + counter++; + } + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + 14 + "}", + recordOffset.offsetToJSON() + ); + Assertions.assertNull(recordOffset.getRecord()); + counter++; + + recordOffset = messages.get(1).get(counter); + Assertions + .assertEquals( + "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + 15 + "}", + recordOffset.offsetToJSON() + ); + rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord())); + e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next); + Assertions.assertEquals("PRIORITY < missing", e.getMessage()); + counter++; + + Assertions.assertEquals(80, counter); + + }); + }
+
+    /**
+     * Runs one ReadCoordinator against "testConsumerTopic" and checks the single batch handed to the output
+     * callback: 160 records, 16 per partition, in the partition order 7, 8, 5, 6, 3, 4, 1, 2, 0, 9. For every
+     * partition offsets 0-13 must carry the known messages in order, offset 14 must have null content and
+     * offset 15 must fail RFC5424 parsing with "PRIORITY < missing".
+     */
+    @Test
+    public void readCoordinatorTest1Thread() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            Config config = new Config();
+            // NOTE(review): left as a raw Map because the key/value types ReadCoordinator expects are not visible here.
+            Map hdfsStartOffsets = new HashMap<>();
+            ArrayList<List<RecordOffset>> messages = new ArrayList<>();
+            Consumer<List<RecordOffset>> output = message -> messages.add(message);
+
+            ReadCoordinator readCoordinator = new ReadCoordinator(
+                    "testConsumerTopic",
+                    config.getKafkaConsumerProperties(),
+                    output,
+                    hdfsStartOffsets
+            );
+            // Creates the thread with readCoordinator that creates the consumer and subscribes to the topic.
+            Thread readThread = new Thread(null, readCoordinator, "testConsumerTopic0");
+            // Starts the thread, in other words proceeds to call run() function of ReadCoordinator.
+            // NOTE(review): the thread is never stopped or joined, and messages is a plain ArrayList that is
+            // presumably filled from readThread while this thread reads it after the sleep -- confirm this is intended.
+            readThread.start();
+
+            Thread.sleep(10000);
+            Assertions.assertEquals(1, messages.size());
+            // Assert that expected amount of records has been consumed by the consumer.
+            Assertions.assertEquals(160, messages.get(0).size());
+
+            // Assert that all the record contents are correct, every topic partition has identical set of offset-message pairings.
+            List<String> list = new ArrayList<>();
+            list.add("[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!");
+            list.add("[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!");
+            list.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.");
+            list.add("470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.");
+            list.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.");
+            list
+                    .add(
+                            "25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]"
+                    );
+            list
+                    .add(
+                            "25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]"
+                    );
+
+            RFC5424Frame rfc5424Frame = new RFC5424Frame(false);
+            RecordOffset recordOffset;
+            Iterator<String> iterator;
+            // Order in which the partitions are expected to appear inside the single consumed batch.
+            List<Integer> partitionList = new ArrayList<>();
+            partitionList.add(7);
+            partitionList.add(8);
+            partitionList.add(5);
+            partitionList.add(6);
+            partitionList.add(3);
+            partitionList.add(4);
+            partitionList.add(1);
+            partitionList.add(2);
+            partitionList.add(0);
+            partitionList.add(9);
+            int counter = 0;
+            for (int partition : partitionList) {
+                // Offsets 0-13: parsable records whose messages match the expected list in order.
+                iterator = list.iterator();
+                for (int i = 0; i <= 13; i++) {
+                    recordOffset = messages.get(0).get(counter);
+                    Assertions
+                            .assertEquals(
+                                    "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + i
+                                            + "}",
+                                    recordOffset.offsetToJSON()
+                            );
+                    rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
+                    Assertions.assertTrue(rfc5424Frame.next());
+                    Assertions.assertTrue(iterator.hasNext());
+                    Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
+                    Assertions.assertFalse(rfc5424Frame.next());
+                    counter++;
+                }
+
+                // Offset 14: record with null content.
+                recordOffset = messages.get(0).get(counter);
+                Assertions
+                        .assertEquals(
+                                "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + 14
+                                        + "}",
+                                recordOffset.offsetToJSON()
+                        );
+                Assertions.assertNull(recordOffset.getRecord());
+                counter++;
+
+                // Offset 15: record whose content is not valid RFC5424.
+                recordOffset = messages.get(0).get(counter);
+                Assertions
+                        .assertEquals(
+                                "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + 15
+                                        + "}",
+                                recordOffset.offsetToJSON()
+                        );
+                rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
+                ParseException e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
+                Assertions.assertEquals("PRIORITY < missing", e.getMessage());
+                counter++;
+            }
+
+            Assertions.assertEquals(160, counter); // All 160 records were asserted.
+
+        });
+    }
+
+} diff --git a/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java b/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java new file mode 100644 index 00000000..63d83e8c --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java @@ -0,0 +1,194 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications.
+ * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.DatabaseOutput; +import com.teragrep.cfe_39.consumers.kafka.RecordOffset; +import com.teragrep.cfe_39.metrics.DurationStatistics; +import com.teragrep.cfe_39.metrics.topic.TopicCounter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +// Tests for processing of consumed kafka records with skipping of broken records disabled (both null and non rfc5424). 
+public class ProcessingFailureTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProcessingFailureTest.class); + + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; + + // Prepares known state for testing. + @BeforeEach + public void startMiniCluster() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration with skipping of broken records disabled. + System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/failProcessing.application.properties"); + config = new Config(); + // Create a HDFS miniCluster + baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile(); + hdfsCluster = new TestMiniClusterFactory().create(config, baseDir); + fs = new TestFileSystemFactory().create(config.getHdfsuri()); + }); + } + + // Teardown the minicluster + @AfterEach + public void teardownMiniCluster() { + assertDoesNotThrow(() -> { + fs.close(); + }); + hdfsCluster.shutdown(); + FileUtil.fullyDelete(baseDir); + } + + @Test + public void failNonRFC5424DatabaseOutputTest() { + // Initialize and register duration statistics + DurationStatistics durationStatistics = new DurationStatistics(); + durationStatistics.register(); + + // register per topic counting + List topicCounters = new CopyOnWriteArrayList<>(); + + assertDoesNotThrow(() -> { + + Consumer> output = new DatabaseOutput( + config, // Configuration settings + "topicName", // String, the name of the topic + durationStatistics, // RuntimeStatistics object from metrics + new TopicCounter("topicName") // TopicCounter object from metrics + ); + + ConsumerRecord record = new ConsumerRecord<>( + "topicName", + 0, + 1L, + "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8), + "12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" 
uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!" + .getBytes(StandardCharsets.UTF_8) + ); + RecordOffset recordOffsetObject = new RecordOffset( + record.topic(), + record.partition(), + record.offset(), + record.value() + ); + + List recordOffsetObjectList = new ArrayList<>(); + recordOffsetObjectList.add(recordOffsetObject); + Exception e = Assertions.assertThrows(Exception.class, () -> output.accept(recordOffsetObjectList)); + Assertions.assertEquals("com.teragrep.rlo_06.PriorityParseException: PRIORITY < missing", e.getMessage()); + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1"))); + // No files stored to hdfs. 
+ }); + + } + + @Test + public void failNullRFC5424DatabaseOutputTest() { + // Initialize and register duration statistics + DurationStatistics durationStatistics = new DurationStatistics(); + durationStatistics.register(); + + // register per topic counting + List topicCounters = new CopyOnWriteArrayList<>(); + + assertDoesNotThrow(() -> { + + Consumer> output = new DatabaseOutput( + config, // Configuration settings + "topicName", // String, the name of the topic + durationStatistics, // RuntimeStatistics object from metrics + new TopicCounter("topicName") // TopicCounter object from metrics + ); + + ConsumerRecord record = new ConsumerRecord<>( + "topicName", + 0, + 1L, + "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8), + null + ); + RecordOffset recordOffsetObject = new RecordOffset( + record.topic(), + record.partition(), + record.offset(), + record.value() + ); + + List recordOffsetObjectList = new ArrayList<>(); + recordOffsetObjectList.add(recordOffsetObject); + NullPointerException e = Assertions + .assertThrows(NullPointerException.class, () -> output.accept(recordOffsetObjectList)); + Assertions.assertEquals("Record with null content detected during processing.", e.getMessage()); + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1"))); + // No files stored to hdfs. + }); + + } +} diff --git a/src/test/java/com/teragrep/cfe_39/ProcessingTest.java b/src/test/java/com/teragrep/cfe_39/ProcessingTest.java new file mode 100644 index 00000000..17820c7f --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/ProcessingTest.java @@ -0,0 +1,311 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.avro.SyslogRecord; +import com.teragrep.cfe_39.consumers.kafka.DatabaseOutput; +import com.teragrep.cfe_39.consumers.kafka.RecordOffset; +import com.teragrep.cfe_39.metrics.DurationStatistics; +import com.teragrep.cfe_39.metrics.topic.TopicCounter; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.specific.SpecificDatumReader; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +// Tests for processing of consumed kafka records with skipping of broken records enabled (both null and non rfc5424). +public class ProcessingTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProcessingTest.class); + + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs;
+
+    /**
+     * Prepares known state for testing: points the configuration at valid.application.properties, starts a HDFS
+     * mini cluster in a fresh temporary directory and opens a file system handle to it.
+     */
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+        });
+    }
+
+    /**
+     * Tears down the mini cluster. Shutting the cluster down and deleting its base directory happen in a finally
+     * block so that a failure while closing the file system cannot leave the cluster running.
+     */
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            assertDoesNotThrow(() -> {
+                fs.close();
+            });
+        }
+        finally {
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
+    /**
+     * With skipping enabled, a record whose content is not valid RFC5424 must be skipped: DatabaseOutput still
+     * stores the file "0.1" for the topic, but that file contains no records.
+     */
+    @Test
+    public void skipNonRFC5424DatabaseOutputTest() {
+        // Initialize and register duration statistics
+        DurationStatistics durationStatistics = new DurationStatistics();
+        durationStatistics.register();
+
+        // register per topic counting
+        // NOTE(review): this list is never read by the test.
+        List<TopicCounter> topicCounters = new CopyOnWriteArrayList<>();
+
+        assertDoesNotThrow(() -> {
+
+            Consumer<List<RecordOffset>> output = new DatabaseOutput(
+                    config, // Configuration settings
+                    "topicName", // String, the name of the topic
+                    durationStatistics, // RuntimeStatistics object from metrics
+                    new TopicCounter("topicName") // TopicCounter object from metrics
+            );
+
+            ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(
+                    "topicName",
+                    0,
+                    1L,
+                    "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
+                    "12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+                            .getBytes(StandardCharsets.UTF_8)
+            );
+            RecordOffset recordOffsetObject = new RecordOffset(
+                    record.topic(),
+                    record.partition(),
+                    record.offset(),
+                    record.value()
+            );
+
+            List<RecordOffset> recordOffsetObjectList = new ArrayList<>();
+            recordOffsetObjectList.add(recordOffsetObject);
+            output.accept(recordOffsetObjectList);
+            Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length);
+            Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1")));
+            // File in hdfs does not contain any records, but acts as a marker for kafka consumer offsets.
+
+            // Assert that the file in hdfs contains the expected zero record.
+
+            Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1");
+            // The data is in AVRO-format, so it can't be read as a string. Both the input stream and the AVRO
+            // reader are closed by try-with-resources, also when an assertion fails.
+            try (
+                    FSDataInputStream inputStream = fs.open(hdfsreadpath);
+                    DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+                            inputStream,
+                            new SpecificDatumReader<>(SyslogRecord.class)
+                    )
+            ) {
+                LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+                Assertions.assertFalse(reader.hasNext());
+            }
+        });
+
+    }
+
+ @Test + public void skipNullRFC5424DatabaseOutputTest() { + // Initialize and register duration statistics + DurationStatistics durationStatistics = new DurationStatistics(); + durationStatistics.register(); + + // register per topic counting + List topicCounters = new CopyOnWriteArrayList<>(); + + assertDoesNotThrow(() -> { + + Consumer> output = new DatabaseOutput( + config, // Configuration settings + "topicName", // String, the name of the topic + durationStatistics, // RuntimeStatistics object from metrics + new TopicCounter("topicName") // TopicCounter object from metrics + ); + + ConsumerRecord record = new ConsumerRecord<>( + "topicName", + 0, + 1L, + "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8), + null + ); + RecordOffset recordOffsetObject = new RecordOffset( + record.topic(), + record.partition(), + record.offset(), + record.value() + ); + + List recordOffsetObjectList = new ArrayList<>(); + recordOffsetObjectList.add(recordOffsetObject); + output.accept(recordOffsetObjectList); + Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1"))); + // File in hdfs does not contain any records, but acts as a marker for kafka consumer offsets. + + // Assert that the file in hdfs contains the expected zero record. + + Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1"); + //Init input stream + FSDataInputStream inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string.
+ DataFileStream reader = new DataFileStream<>( + inputStream, + new SpecificDatumReader<>(SyslogRecord.class) + ); + SyslogRecord syslogRecord = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertFalse(reader.hasNext()); + }); + + } + + @Test + public void skipNullAndNonRFC5424DatabaseOutputTest() { + // Initialize and register duration statistics + DurationStatistics durationStatistics = new DurationStatistics(); + durationStatistics.register(); + + // register per topic counting + List topicCounters = new CopyOnWriteArrayList<>(); + + assertDoesNotThrow(() -> { + + Consumer> output = new DatabaseOutput( + config, // Configuration settings + "topicName", // String, the name of the topic + durationStatistics, // RuntimeStatistics object from metrics + new TopicCounter("topicName") // TopicCounter object from metrics + ); + + List recordOffsetObjectList = new ArrayList<>(); + + ConsumerRecord record = new ConsumerRecord<>( + "topicName", + 0, + 1L, + "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8), + null + ); + RecordOffset recordOffsetObject = new RecordOffset( + record.topic(), + record.partition(), + record.offset(), + record.value() + ); + recordOffsetObjectList.add(recordOffsetObject); + + record = new ConsumerRecord<>( + "topicName", + 0, + 2L, + "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8), + "12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" 
source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!" + .getBytes(StandardCharsets.UTF_8) + ); + recordOffsetObject = new RecordOffset(record.topic(), record.partition(), record.offset(), record.value()); + recordOffsetObjectList.add(recordOffsetObject); + record = new ConsumerRecord<>( + "topicName", + 0, + 3L, + "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8), + "<12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!" + .getBytes(StandardCharsets.UTF_8) + ); + recordOffsetObject = new RecordOffset(record.topic(), record.partition(), record.offset(), record.value()); + recordOffsetObjectList.add(recordOffsetObject); + output.accept(recordOffsetObjectList); + Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.3"))); + + // Assert that the file in hdfs contains the expected single record. 
+ + Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.3"); + //Init input stream + FSDataInputStream inputStream = fs.open(hdfsreadpath); + //The data is in AVRO-format, so it can't be read as a string. + DataFileStream reader = new DataFileStream<>( + inputStream, + new SpecificDatumReader<>(SyslogRecord.class) + ); + SyslogRecord syslogRecord = null; + LOGGER.info("\nReading records from file {}:", hdfsreadpath); + + Assertions.assertTrue(reader.hasNext()); + syslogRecord = reader.next(syslogRecord); + Assertions + .assertEquals( + "{\"timestamp\": 1650872090807000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"0\", \"offset\": 3, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}", + syslogRecord.toString() + ); + Assertions.assertFalse(reader.hasNext()); + + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java new file mode 100644 index 00000000..f89603d3 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java @@ -0,0 +1,112 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Verifies HDFSPrune against an HDFS mini cluster in which the topic directory does not exist
+ * yet, i.e. there is nothing to prune.
+ */
+public class PruningNoFilesTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PruningNoFilesTest.class);
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    // Start minicluster and initialize config.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+        });
+    }
+
+    // Teardown the minicluster. The cluster and its temporary directory are released even when
+    // closing the file system fails, so a failing test cannot leak them into the next one.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(fs::close);
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+    /**
+     * This test case is for testing the functionality of the HDFSPrune.java when the target
+     * database is empty. The test expects that constructing HDFSPrune creates the (empty) topic
+     * directory and that prune() then reports zero deleted files.
+     */
+    @Test
+    public void noFiles() {
+        assertDoesNotThrow(() -> {
+            Path topicDirectory = new Path(config.getHdfsPath() + "/" + "testConsumerTopic");
+
+            Assertions.assertFalse(fs.exists(topicDirectory));
+            HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+            // The directory must exist, and be empty, as soon as the pruner has been constructed.
+            Assertions.assertTrue(fs.exists(topicDirectory));
+            Assertions.assertEquals(0, fs.listStatus(topicDirectory).length);
+
+            int deleted = hdfsPrune.prune();
+            Assertions.assertEquals(0, deleted);
+            Assertions.assertEquals(0, fs.listStatus(topicDirectory).length);
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java
new file mode 100644
index 00000000..bcd06660
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java
@@ -0,0 +1,149 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Verifies that HDFSPrune leaves a single, freshly written file untouched.
+ */
+public class PruningOneNewFileTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneNewFileTest.class);
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    // Prepares known state for testing: a mini cluster whose topic directory holds the mock file
+    // "0.9" with the modification time it received when it was copied in.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+
+            // Inserts a single pre-made avro-file with a new timestamp to HDFS, which is normally generated during data ingestion from mock kafka consumer.
+            String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+            // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+            Path newDirectoryPath = new Path(path);
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.debug("Path {} created.", path);
+
+            // The mock file must be present among the regular files of the resource directory.
+            String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles";
+            Set<String> listOfFiles = Stream
+                    .of(Objects.requireNonNull(new File(dir).listFiles()))
+                    .filter(file -> !file.isDirectory())
+                    .map(File::getName)
+                    .collect(Collectors.toSet());
+            String fileName = "0.9";
+            Assertions.assertTrue(listOfFiles.contains(fileName));
+            File avroFile = new File(dir + "/" + fileName);
+
+            //==== Write file
+            LOGGER.debug("Begin Write file into hdfs");
+            // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+            Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName());
+            Assertions.assertFalse(fs.exists(hdfswritepath));
+            fs.copyFromLocalFile(new Path(avroFile.getPath()), hdfswritepath);
+            // -1 tells setTimes not to set that time, so the file keeps the "new" times from the copy.
+            fs.setTimes(hdfswritepath, -1, -1);
+            LOGGER.debug("End Write file into hdfs");
+            LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath);
+        });
+    }
+
+    // Teardown the minicluster. The cluster and its temporary directory are released even when
+    // closing the file system fails, so a failing test cannot leak them into the next one.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(fs::close);
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+    /**
+     * This test case is for testing the functionality of the HDFSPrune.java when the database
+     * holds a file with a timestamp that shouldn't trigger pruning of old files: prune() must
+     * report zero deletions and the file must still be present afterwards.
+     */
+    @Test
+    public void oneNewFileTest() {
+        // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+        Assertions.assertTrue(config.getPruneOffset() >= 300000L);
+        Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+
+        assertDoesNotThrow(() -> {
+            Path topicDirectory = new Path(config.getHdfsPath() + "/" + "testConsumerTopic");
+            Path newFile = new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9");
+
+            Assertions.assertTrue(fs.exists(topicDirectory));
+            Assertions.assertEquals(1, fs.listStatus(topicDirectory).length);
+            Assertions.assertTrue(fs.exists(newFile));
+
+            HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+            int deleted = hdfsPrune.prune();
+            Assertions.assertEquals(0, deleted);
+
+            // Also check with HDFS access that the new file still exists.
+            Assertions.assertEquals(1, fs.listStatus(topicDirectory).length);
+            Assertions.assertTrue(fs.exists(newFile));
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java
new file mode 100644
index 00000000..0e7445f3
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java
@@ -0,0 +1,149 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Verifies that HDFSPrune deletes a single file whose modification time lies far in the past.
+ */
+public class PruningOneOldFileTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneOldFileTest.class);
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    // Prepares known state for testing: a mini cluster whose topic directory holds the mock file
+    // "0.9" with its modification time forced to 157784760000 (epoch milliseconds).
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+
+            // Inserts a single pre-made avro-file with an old timestamp to HDFS, which is normally generated during data ingestion from mock kafka consumer.
+            String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+            // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+            Path newDirectoryPath = new Path(path);
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.debug("Path {} created.", path);
+
+            // The mock file must be present among the regular files of the resource directory.
+            String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles";
+            Set<String> listOfFiles = Stream
+                    .of(Objects.requireNonNull(new File(dir).listFiles()))
+                    .filter(file -> !file.isDirectory())
+                    .map(File::getName)
+                    .collect(Collectors.toSet());
+            String fileName = "0.9";
+            Assertions.assertTrue(listOfFiles.contains(fileName));
+            File avroFile = new File(dir + "/" + fileName);
+
+            //==== Write file
+            LOGGER.debug("Begin Write file into hdfs");
+            // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+            Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName());
+            Assertions.assertFalse(fs.exists(hdfswritepath));
+            fs.copyFromLocalFile(new Path(avroFile.getPath()), hdfswritepath);
+            // Backdate the modification time; -1 leaves the access time as it is.
+            fs.setTimes(hdfswritepath, 157784760000L, -1);
+            LOGGER.debug("End Write file into hdfs");
+            LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath);
+        });
+    }
+
+    // Teardown the minicluster. The cluster and its temporary directory are released even when
+    // closing the file system fails, so a failing test cannot leak them into the next one.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(fs::close);
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+    /**
+     * This test case is for testing the functionality of the HDFSPrune.java when the database
+     * holds a file with a timestamp that should trigger pruning of old files: prune() must report
+     * one deletion and the topic directory must be empty afterwards.
+     */
+    @Test
+    public void oneOldFileTest() {
+        // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+        Assertions.assertTrue(config.getPruneOffset() >= 300000L);
+        // The backdated modification time must be older than "now - prune offset".
+        Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+
+        assertDoesNotThrow(() -> {
+            Path topicDirectory = new Path(config.getHdfsPath() + "/" + "testConsumerTopic");
+            Path oldFile = new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9");
+
+            Assertions.assertTrue(fs.exists(topicDirectory));
+            Assertions.assertEquals(1, fs.listStatus(topicDirectory).length);
+            Assertions.assertTrue(fs.exists(oldFile));
+
+            HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+            int deleted = hdfsPrune.prune();
+            Assertions.assertEquals(1, deleted);
+
+            // Also check with HDFS access that the old file is gone and nothing else is left.
+            Assertions.assertEquals(0, fs.listStatus(topicDirectory).length);
+            Assertions.assertFalse(fs.exists(oldFile));
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java
new file mode 100644
index 00000000..483e36dc
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java
@@ -0,0 +1,155 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Verifies that HDFSPrune deletes only the old file when the topic directory holds one old and
+ * one new file.
+ */
+public class PruningOneOldOneNewFileTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneOldOneNewFileTest.class);
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static Config config;
+    private FileSystem fs;
+
+    // Prepares known state for testing: every regular file of the mock resource directory is
+    // copied into the topic directory, then "0.9" is backdated while "0.13" keeps its new times.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // Set system properties to use the valid configuration.
+            System
+                    .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
+            config = new Config();
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
+            fs = new TestFileSystemFactory().create(config.getHdfsuri());
+
+            /* Inserts pre-made avro-files to HDFS, which are normally generated during data ingestion from mock kafka consumer.
+            One file has new timestamp and another old timestamp.*/
+            String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+            // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+            Path newDirectoryPath = new Path(path);
+            fs.mkdirs(newDirectoryPath);
+            LOGGER.debug("Path {} created.", path);
+
+            String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles";
+            Set<String> listOfFiles = Stream
+                    .of(Objects.requireNonNull(new File(dir).listFiles()))
+                    .filter(file -> !file.isDirectory())
+                    .map(File::getName)
+                    .collect(Collectors.toSet());
+            // Loop through all the avro files
+            for (String fileName : listOfFiles) {
+                File avroFile = new File(dir + "/" + fileName);
+                //==== Write file
+                LOGGER.debug("Begin Write file into hdfs");
+                // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+                Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName());
+                Assertions.assertFalse(fs.exists(hdfswritepath));
+                fs.copyFromLocalFile(new Path(avroFile.getPath()), hdfswritepath);
+                LOGGER.debug("End Write file into hdfs");
+                LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath);
+            }
+            // Resolve both files against the configured topic directory instead of a hard-coded
+            // absolute path, so the setup keeps working when the configured HDFS path changes.
+            // Backdate the modification time of 0.9; -1 leaves a time as it is.
+            fs.setTimes(new Path(newDirectoryPath, "0.9"), 157784760000L, -1);
+            fs.setTimes(new Path(newDirectoryPath, "0.13"), -1, -1);
+        });
+    }
+
+    // Teardown the minicluster. The cluster and its temporary directory are released even when
+    // closing the file system fails, so a failing test cannot leak them into the next one.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            if (fs != null) {
+                assertDoesNotThrow(fs::close);
+            }
+        }
+        finally {
+            if (hdfsCluster != null) {
+                hdfsCluster.shutdown();
+            }
+            if (baseDir != null) {
+                FileUtil.fullyDelete(baseDir);
+            }
+        }
+    }
+
+    /**
+     * This test case is for testing the functionality of the HDFSPrune.java when the database
+     * holds a file with a timestamp that shouldn't trigger pruning of old files and another file
+     * that should trigger the pruning. The file with newer timestamp is ignored while the older
+     * is deleted from the database.
+     */
+    @Test
+    public void oneOldOneNewFileTest() {
+        // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+        Assertions.assertTrue(config.getPruneOffset() >= 300000L);
+        // The backdated modification time must be older than "now - prune offset".
+        Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+
+        assertDoesNotThrow(() -> {
+            Path topicDirectory = new Path(config.getHdfsPath() + "/" + "testConsumerTopic");
+            Path oldFile = new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9");
+            Path newFile = new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13");
+
+            Assertions.assertTrue(fs.exists(topicDirectory));
+            Assertions.assertEquals(2, fs.listStatus(topicDirectory).length);
+            Assertions.assertTrue(fs.exists(oldFile));
+            Assertions.assertTrue(fs.exists(newFile));
+
+            HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+            int deleted = hdfsPrune.prune();
+            Assertions.assertEquals(1, deleted);
+
+            // Also check with HDFS access that only the new file is left.
+            Assertions.assertEquals(1, fs.listStatus(topicDirectory).length);
+            Assertions.assertFalse(fs.exists(oldFile));
+            Assertions.assertTrue(fs.exists(newFile));
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java
new file mode 100644
index 00000000..0f3b450c
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java
@@ -0,0 +1,151 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy.
Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.HDFSPrune; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.file.Files; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class PruningTwoNewFilesTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(PruningTwoNewFilesTest.class); + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; + + // Prepares known state for testing. + @BeforeEach + public void startMiniCluster() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration. 
+ System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"); + config = new Config(); + // Create a HDFS miniCluster + baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile(); + hdfsCluster = new TestMiniClusterFactory().create(config, baseDir); + fs = new TestFileSystemFactory().create(config.getHdfsuri()); + + // Inserts pre-made avro-files with new timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer. + + String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic" + // Sets the directory where the data should be stored, if the directory doesn't exist then it's created. + Path newDirectoryPath = new Path(path); + // Create new Directory + fs.mkdirs(newDirectoryPath); + LOGGER.debug("Path {} created.", path); + + String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles"; + Set listOfFiles = Stream + .of(Objects.requireNonNull(new File(dir).listFiles())) + .filter(file -> !file.isDirectory()) + .map(File::getName) + .collect(Collectors.toSet()); + // Loop through all the avro files + for (String fileName : listOfFiles) { + String pathname = dir + "/" + fileName; + File avroFile = new File(pathname); + //==== Write file + LOGGER.debug("Begin Write file into hdfs"); + //Create a path + Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName()); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset. 
+ Assertions.assertFalse(fs.exists(hdfswritepath)); + Path readPath = new Path(avroFile.getPath()); + fs.copyFromLocalFile(readPath, hdfswritepath); + LOGGER.debug("End Write file into hdfs"); + LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath); + } + }); + + } + + // Teardown the minicluster + @AfterEach + public void teardownMiniCluster() { + assertDoesNotThrow(fs::close); + hdfsCluster.shutdown(); + FileUtil.fullyDelete(baseDir); + } + + @Test + public void twoNewFilesTest() { + // This test case is for testing the functionality of the HDFSPrune.java when the database holds two files with a timestamp that shouldn't trigger pruning of old files. + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging. + + assertDoesNotThrow(() -> { + Assertions + .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs); + int deleted = hdfsPrune.prune(); + Assertions.assertEquals(0, deleted); + // Also check with HDFS access if expected files still exist. 
+ Assertions + .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java new file mode 100644 index 00000000..0a5ae764 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java @@ -0,0 +1,154 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. 
Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import com.teragrep.cfe_39.consumers.kafka.HDFSPrune; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.nio.file.Files; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +public class PruningTwoOldFilesTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(PruningTwoOldFilesTest.class); + private static MiniDFSCluster hdfsCluster; + private static File baseDir; + private static Config config; + private FileSystem fs; + + // Prepares known state for testing. + @BeforeEach + public void startMiniCluster() { + assertDoesNotThrow(() -> { + // Set system properties to use the valid configuration. 
+ System + .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"); + config = new Config(); + // Create a HDFS miniCluster + baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile(); + hdfsCluster = new TestMiniClusterFactory().create(config, baseDir); + fs = new TestFileSystemFactory().create(config.getHdfsuri()); + + // Inserts pre-made avro-files with old timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer. + + String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic" + // Sets the directory where the data should be stored, if the directory doesn't exist then it's created. + Path newDirectoryPath = new Path(path); + // Create new Directory + fs.mkdirs(newDirectoryPath); + LOGGER.debug("Path {} created.", path); + String dir = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles"; + Set listOfFiles = Stream + .of(Objects.requireNonNull(new File(dir).listFiles())) + .filter(file -> !file.isDirectory()) + .map(File::getName) + .collect(Collectors.toSet()); + // Loop through all the avro files + for (String fileName : listOfFiles) { + String pathname = dir + "/" + fileName; + File avroFile = new File(pathname); + //==== Write file + LOGGER.debug("Begin Write file into hdfs"); + //Create a path + Path hdfswritepath = new Path(newDirectoryPath + "/" + avroFile.getName()); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset. 
+ Assertions.assertFalse(fs.exists(hdfswritepath)); + Path readPath = new Path(avroFile.getPath()); + fs.copyFromLocalFile(readPath, hdfswritepath); + fs.setTimes(hdfswritepath, 157784760000L, -1); + LOGGER.debug("End Write file into hdfs"); + LOGGER.debug("\nFile committed to HDFS, file writepath should be: {}\n", hdfswritepath); + } + }); + + } + + // Teardown the minicluster + @AfterEach + public void teardownMiniCluster() { + assertDoesNotThrow(fs::close); + hdfsCluster.shutdown(); + FileUtil.fullyDelete(baseDir); + } + + @Test + public void twoOldFilesTest() { + // This test case is for testing the functionality of the HDFSPrune.java when the database holds two files with a timestamp that should trigger pruning of old files. + Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging. + Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L); + + assertDoesNotThrow(() -> { + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic"))); + Assertions + .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs); + int deleted = hdfsPrune.prune(); + Assertions.assertEquals(2, deleted); + // Also check with HDFS access if expected files still exist. 
+ Assertions + .assertEquals(0, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length); + Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); + Assertions + .assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); + }); + } +} diff --git a/src/test/java/com/teragrep/cfe_39/TestFileSystemFactory.java b/src/test/java/com/teragrep/cfe_39/TestFileSystemFactory.java new file mode 100644 index 00000000..af8ff781 --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/TestFileSystemFactory.java @@ -0,0 +1,71 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. 
Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. + */ +package com.teragrep.cfe_39; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.net.URI; + +public class TestFileSystemFactory { + + public FileSystem create(String hdfsURI) throws IOException { + FileSystem fs; + // ====== Init HDFS File System Object + Configuration fsConf = new Configuration(); + // Set FileSystem URI + fsConf.set("fs.defaultFS", hdfsURI); + // Because of Maven + fsConf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + fsConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // Set HADOOP user + System.setProperty("HADOOP_USER_NAME", "hdfs"); + System.setProperty("hadoop.home.dir", "/"); + fs = FileSystem.get(URI.create(hdfsURI), fsConf); + return fs; + } +} diff --git a/src/test/java/com/teragrep/cfe_39/TestMiniClusterFactory.java b/src/test/java/com/teragrep/cfe_39/TestMiniClusterFactory.java new file mode 100644 index 00000000..e7fbfb8f --- /dev/null +++ b/src/test/java/com/teragrep/cfe_39/TestMiniClusterFactory.java @@ -0,0 +1,70 @@ +/* + * HDFS Data Ingestion for PTH_06 use CFE-39 + * Copyright (C) 2021-2024 Suomen Kanuuna Oy + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * 
the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * + * Additional permission under GNU Affero General Public License version 3 + * section 7 + * + * If you modify this Program, or any covered work, by linking or combining it + * with other code, such other code is not for that reason alone subject to any + * of the requirements of the GNU Affero GPL version 3 as long as this Program + * is the same Program as licensed from Suomen Kanuuna Oy without any additional + * modifications. + * + * Supplemented terms under GNU Affero General Public License version 3 + * section 7 + * + * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified + * versions must be marked as "Modified version of" The Program. + * + * Names of the licensors and authors may not be used for publicity purposes. + * + * No rights are granted for use of trade names, trademarks, or service marks + * which are in The Program if any. + * + * Licensee must indemnify licensors and authors for any liability that these + * contractual assumptions impose on licensors and authors. + * + * To the extent this program is licensed as part of the Commercial versions of + * Teragrep, the applicable Commercial License may apply to this file if you as + * a licensee so wish it. 
+ */ +package com.teragrep.cfe_39; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; + +import java.io.File; +import java.io.IOException; + +// Helper class for creating MiniDFSCluster objects. +public class TestMiniClusterFactory { + + public MiniDFSCluster create(Config config, File baseDir) throws IOException { + MiniDFSCluster hdfsCluster; + // Create a HDFS miniCluster + Configuration conf = new Configuration(); + conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()); + MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf); + hdfsCluster = builder.build(); + String hdfsURI = "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/"; + config.setHdfsuri(hdfsURI); + DistributedFileSystem fileSystem = hdfsCluster.getFileSystem(); + return hdfsCluster; + } +} diff --git a/src/test/resources/broken.application.properties b/src/test/resources/broken.application.properties new file mode 100644 index 00000000..433eee07 --- /dev/null +++ b/src/test/resources/broken.application.properties @@ -0,0 +1,44 @@ +# What topics are searched from kafka, regex +queueTopicPattern=^testConsumerTopic-*$ +# Number of consumers created to the consumer groups +numOfConsumers=2 +# Kafka bootstrap servers +consumer.bootstrap.servers=test +# Offset, should not be touched +consumer.auto.offset.reset=earliest +# Autocommit, should not be touched +consumer.enable.auto.commit=false +# Consumer group id, this is to track the progress of reading the topic +consumer.group.id=cfe_39 +# Used security protocol and mechanism +consumer.security.protocol=SASL_PLAINTEXT +consumer.sasl.mechanism=PLAIN +# Maximum records per batch, note that too big number will cause massive load and can cause timeouts to trigger +consumer.max.poll.records=500 +# How much data can be fetched in one go +consumer.fetch.max.bytes=1073741820 +# How long for request before timing out.
Note that too big max poll records size can cause this to trigger +consumer.request.timeout.ms=300000 +consumer.max.poll.interval.ms=300000 +# For testing only, remove for prod. +consumer.useMockKafkaConsumer=true +# The maximum file size for AVRO-files that are to be stored in HDFS database. +maximumFileSize=3000 +# Boolean for deciding if records not in RFC5424 should be skipped or not. +skipNonRFC5424Records=true +# Boolean for deciding if empty RFC5424 records should be skipped or not. +skipEmptyRFC5424Records=true +# HDFS pruning, use 157784760000 value while testing HDFS writes to ensure the test records are not pruned. 157784760000L +pruneOffset=157784760000 +# Kerberos +java.security.krb5.kdc=test +java.security.krb5.realm=test +hadoop.security.authentication=test +hadoop.security.authorization=test +dfs.namenode.kerberos.principal.pattern=test +KerberosKeytabUser=test +KerberosKeytabPath=test +dfs.client.use.datanode.hostname=false +kerberosLoginAutorenewal=true +dfs.data.transfer.protection=test +dfs.encrypt.data.transfer.cipher.suites=test \ No newline at end of file diff --git a/src/test/resources/failProcessing.application.properties b/src/test/resources/failProcessing.application.properties new file mode 100644 index 00000000..55bc98d2 --- /dev/null +++ b/src/test/resources/failProcessing.application.properties @@ -0,0 +1,46 @@ +# What topics are searched from kafka, regex +queueTopicPattern=^testConsumerTopic-*$ +# Number of consumers created to the consumer groups +numOfConsumers=2 +# Kafka bootstrap servers +consumer.bootstrap.servers=test +# Offset, should not be touched +consumer.auto.offset.reset=earliest +# Autocommit, should not be touched +consumer.enable.auto.commit=false +# Consumer group id, this is to track the progress of reading the topic +consumer.group.id=cfe_39 +# Used security protocol and mechanism +consumer.security.protocol=SASL_PLAINTEXT +consumer.sasl.mechanism=PLAIN +# Maximum records per batch, note that too big number will
cause massive load and can cause timeouts to trigger +consumer.max.poll.records=500 +# How much data can be fetched in one go +consumer.fetch.max.bytes=1073741820 +# How long for request before timing out. Note that too big max poll records size can cause this to trigger +consumer.request.timeout.ms=300000 +consumer.max.poll.interval.ms=300000 +# For testing only, remove for prod. +consumer.useMockKafkaConsumer=true +# The maximum file size for AVRO-files that are to be stored in HDFS database. +maximumFileSize=3000 +# Boolean for deciding if records not in RFC5424 should be skipped or not. +skipNonRFC5424Records=false +# Boolean for deciding if empty RFC5424 records should be skipped or not. +skipEmptyRFC5424Records=false +# HDFS pruning, use 157784760000 value while testing HDFS writes to ensure the test records are not pruned. 157784760000L +pruneOffset=157784760000 +# HDFS uri +hdfsuri=hdfs://localhost:45937/ +# Kerberos +java.security.krb5.kdc=test +java.security.krb5.realm=test +hadoop.security.authentication=test +hadoop.security.authorization=test +dfs.namenode.kerberos.principal.pattern=test +KerberosKeytabUser=test +KerberosKeytabPath=test +dfs.client.use.datanode.hostname=false +kerberosLoginAutorenewal=true +dfs.data.transfer.protection=test +dfs.encrypt.data.transfer.cipher.suites=test \ No newline at end of file diff --git a/src/test/resources/mockHdfsFiles/0.13 b/src/test/resources/mockHdfsFiles/0.13 new file mode 100644 index 00000000..553f5957 Binary files /dev/null and b/src/test/resources/mockHdfsFiles/0.13 differ diff --git a/src/test/resources/mockHdfsFiles/0.9 b/src/test/resources/mockHdfsFiles/0.9 new file mode 100644 index 00000000..c98cda54 Binary files /dev/null and b/src/test/resources/mockHdfsFiles/0.9 differ diff --git a/src/test/resources/valid.application.properties b/src/test/resources/valid.application.properties new file mode 100644 index 00000000..acbcf93d --- /dev/null +++ b/src/test/resources/valid.application.properties @@ -0,0 
+1,46 @@ +# What topics are searched from kafka, regex +queueTopicPattern=^testConsumerTopic-*$ +# Number of consumers created to the consumer groups +numOfConsumers=2 +# Kafka bootstrap servers +consumer.bootstrap.servers=test +# Offset, should not be touched +consumer.auto.offset.reset=earliest +# Autocommit, should not be touched +consumer.enable.auto.commit=false +# Consumer group id, this is to track the progress of reading the topic +consumer.group.id=cfe_39 +# Used security protocol and mechanism +consumer.security.protocol=SASL_PLAINTEXT +consumer.sasl.mechanism=PLAIN +# Maximum records per batch, note that too big number will cause massive load and can cause timeouts to trigger +consumer.max.poll.records=500 +# How much data can be fetched in one go +consumer.fetch.max.bytes=1073741820 +# How long for request before timing out. Note that too big max poll records size can cause this to trigger +consumer.request.timeout.ms=300000 +consumer.max.poll.interval.ms=300000 +# For testing only, remove for prod. +consumer.useMockKafkaConsumer=true +# The maximum file size for AVRO-files that are to be stored in HDFS database. +maximumFileSize=3000 +# Boolean for deciding if records not in RFC5424 should be skipped or not. +skipNonRFC5424Records=true +# Boolean for deciding if empty RFC5424 records should be skipped or not. +skipEmptyRFC5424Records=true +# HDFS pruning, use 157784760000 value while testing HDFS writes to ensure the test records are not pruned. 157784760000L +pruneOffset=157784760000 +# HDFS uri +hdfsuri=hdfs://localhost:45937/ +# Kerberos +java.security.krb5.kdc=test +java.security.krb5.realm=test +hadoop.security.authentication=test +hadoop.security.authorization=test +dfs.namenode.kerberos.principal.pattern=test +KerberosKeytabUser=test +KerberosKeytabPath=test +dfs.client.use.datanode.hostname=false +kerberosLoginAutorenewal=true +dfs.data.transfer.protection=test +dfs.encrypt.data.transfer.cipher.suites=test \ No newline at end of file