diff --git a/pom.xml b/pom.xml
index bda84206..dc6f512d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,6 +8,7 @@
cfe_39
-SNAPSHOT
+ 1.0.0
3.3.6
1.8
1.8
@@ -78,6 +79,12 @@
rlo_06
9.0.1
+
+
+ com.teragrep
+ cnf_01
+ ${cnf_01.version}
+
org.apache.kafka
@@ -199,9 +206,14 @@
src/test/resources/broken.application.properties
src/test/resources/valid.application.properties
src/test/resources/failProcessing.application.properties
+ src/test/resources/largeFile.application.properties
rpm/resources/config.jaas
rpm/resources/log4j2.properties
rpm/resources/application.properties
+ rpm/resources/ingress.properties
+ rpm/resources/egress.properties
+ src/test/resources/valid.hdfs.properties
+ src/test/resources/valid.kafka.properties
rpm/resources/cfe_39.service
rpm/rpm.pom.xml
src/main/java/com/teragrep/cfe_39/avro/SyslogRecord.java
@@ -380,6 +392,218 @@
+
+ org.apache.maven.plugins
+ maven-checkstyle-plugin
+ 3.5.0
+
+
+
+ scan-errors
+
+ check
+
+ process-classes
+
+ error
+ true
+ true
+ false
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ scan-warnings
+
+ check
+
+ process-classes
+
+ warning
+ true
+ false
+ false
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/rpm/resources/application.properties b/rpm/resources/application.properties
index fc5100cb..816fbffa 100644
--- a/rpm/resources/application.properties
+++ b/rpm/resources/application.properties
@@ -1,52 +1,16 @@
-# Kafka security configuration file
-java.security.auth.login.config=/opt/teragrep/cfe_39/etc/config.jaas
# Logger settings
log4j2.configurationFile=/opt/teragrep/cfe_39/etc/log4j2.properties
+# hdfs settings
+egress.configurationFile=/opt/teragrep/cfe_39/etc/egress.properties
+# kafka settings
+ingress.configurationFile=/opt/teragrep/cfe_39/etc/ingress.properties
# What topics are searched from kafka, regex
queueTopicPattern=^testConsumerTopic-*$
-# Number of consumers created to the consumer groups
-numOfConsumers=2
-# Kafka bootstrap servers
-consumer.bootstrap.servers=test
-# Offset, should not be touched
-consumer.auto.offset.reset=earliest
-# Autocommit, should not be touched
-consumer.enable.auto.commit=false
-# Consumer group id, this is to track the progress of reading hte topic
-consumer.group.id=cfe_39
-# Used security protocol and mechanism
-consumer.security.protocol=SASL_PLAINTEXT
-consumer.sasl.mechanism=PLAIN
-# Maximum records per batch, note that too big number will cause massive load and can cause timeouts to trigger
-consumer.max.poll.records=500
-# How much data can be fetched in one go
-consumer.fetch.max.bytes=1073741820
-# How long for request before timing out. Note that too big max poll records size can cause this to trigger
-consumer.request.timeout.ms=300000
-consumer.max.poll.interval.ms=300000
-# For testing only, remove for prod.
-consumer.useMockKafkaConsumer=true
# Directory where AVRO files are constructed for HDFS
queueDirectory=/opt/teragrep/cfe_39/etc/AVRO/
-# The maximum file size for AVRO-files that are to be stored in HDFS database.
-maximumFileSize=60800000
# Boolean for deciding if records not in RFC5424 should be skipped or not.
skipNonRFC5424Records=true
# Boolean for deciding if empty RFC5424 records should be skipped or not.
skipEmptyRFC5424Records=true
-# HDFS pruning offset, prunes files older than the given milliseconds.
-pruneOffset=172800000
-# HDFS uri
-hdfsuri=hdfs://localhost:45937/
-# Kerberos
-java.security.krb5.kdc=test
-java.security.krb5.realm=test
-hadoop.security.authentication=test
-hadoop.security.authorization=test
-dfs.namenode.kerberos.principal.pattern=test
-KerberosKeytabUser=test
-KerberosKeytabPath=test
-dfs.client.use.datanode.hostname=false
-kerberosLoginAutorenewal=true
-dfs.data.transfer.protection=test
-dfs.encrypt.data.transfer.cipher.suites=test
\ No newline at end of file
+# Timeout modifier for when the consumer's cache of intermediate results is flushed to HDFS
+consumerTimeout=600000
\ No newline at end of file
diff --git a/rpm/resources/egress.properties b/rpm/resources/egress.properties
new file mode 100644
index 00000000..75b76de3
--- /dev/null
+++ b/rpm/resources/egress.properties
@@ -0,0 +1,20 @@
+# HDFS pruning offset in milliseconds; files older than this are pruned. Use 157784760000 while testing HDFS writes so that the test records are not pruned.
+pruneOffset=157784760000
+# HDFS uri
+hdfsuri=hdfs://localhost:45937/
+# HDFS path
+hdfsPath=hdfs:///opt/teragrep/cfe_39/srv/
+# Kerberos
+java.security.krb5.kdc=test
+java.security.krb5.realm=test
+hadoop.security.authentication=kerberos
+hadoop.security.authorization=test
+dfs.namenode.kerberos.principal.pattern=test
+KerberosKeytabUser=test
+KerberosKeytabPath=test
+dfs.client.use.datanode.hostname=false
+hadoop.kerberos.keytab.login.autorenewal.enabled=true
+dfs.data.transfer.protection=test
+dfs.encrypt.data.transfer.cipher.suites=test
+# The maximum file size for AVRO-files that are to be stored in HDFS database.
+maximumFileSize=3000
\ No newline at end of file
diff --git a/rpm/resources/ingress.properties b/rpm/resources/ingress.properties
new file mode 100644
index 00000000..fd123af9
--- /dev/null
+++ b/rpm/resources/ingress.properties
@@ -0,0 +1,24 @@
+# Kafka security configuration file
+java.security.auth.login.config=/opt/teragrep/cfe_39/etc/config.jaas
+# Kafka bootstrap servers
+bootstrap.servers=test
+# Offset, should not be touched
+auto.offset.reset=earliest
+# Autocommit, should not be touched
+enable.auto.commit=false
+# Consumer group id, this is to track the progress of reading the topic
+group.id=cfe_39
+# Used security protocol and mechanism
+security.protocol=SASL_PLAINTEXT
+sasl.mechanism=PLAIN
+# Maximum records per batch, note that too big number will cause massive load and can cause timeouts to trigger
+max.poll.records=500
+# How much data can be fetched in one go
+fetch.max.bytes=1073741820
+# How long for request before timing out. Note that too big max poll records size can cause this to trigger
+request.timeout.ms=300000
+max.poll.interval.ms=300000
+# For testing only
+useMockKafkaConsumer=true
+# Number of consumers created to the consumer groups
+numOfConsumers=2
\ No newline at end of file
diff --git a/src/main/java/com/teragrep/cfe_39/Config.java b/src/main/java/com/teragrep/cfe_39/Config.java
deleted file mode 100644
index c29e1ed9..00000000
--- a/src/main/java/com/teragrep/cfe_39/Config.java
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * HDFS Data Ingestion for PTH_06 use CFE-39
- * Copyright (C) 2021-2024 Suomen Kanuuna Oy
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- *
- * Additional permission under GNU Affero General Public License version 3
- * section 7
- *
- * If you modify this Program, or any covered work, by linking or combining it
- * with other code, such other code is not for that reason alone subject to any
- * of the requirements of the GNU Affero GPL version 3 as long as this Program
- * is the same Program as licensed from Suomen Kanuuna Oy without any additional
- * modifications.
- *
- * Supplemented terms under GNU Affero General Public License version 3
- * section 7
- *
- * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
- * versions must be marked as "Modified version of" The Program.
- *
- * Names of the licensors and authors may not be used for publicity purposes.
- *
- * No rights are granted for use of trade names, trademarks, or service marks
- * which are in The Program if any.
- *
- * Licensee must indemnify licensors and authors for any liability that these
- * contractual assumptions impose on licensors and authors.
- *
- * To the extent this program is licensed as part of the Commercial versions of
- * Teragrep, the applicable Commercial License may apply to this file if you as
- * a licensee so wish it.
- */
-package com.teragrep.cfe_39;
-
-import org.apache.logging.log4j.core.config.Configurator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Enumeration;
-import java.util.Properties;
-
-public class Config {
-
- private final String queueTopicPattern;
- private final Properties kafkaConsumerProperties;
- private static final Logger LOGGER = LoggerFactory.getLogger(Config.class);
- private final String hdfsPath;
- private String hdfsuri;
- private final String queueDirectory;
- private final String kerberosHost;
- private final String kerberosRealm;
- private final String kerberosPrincipal;
- private final String hadoopAuthentication;
- private final String hadoopAuthorization;
- private final String kerberosKeytabUser;
- private final String kerberosKeytabPath;
- private final String kerberosLoginAutorenewal;
- private final String kerberosTestMode;
- private long maximumFileSize;
- private final int numOfConsumers;
- private final long pruneOffset;
- private final boolean skipNonRFC5424Records;
- private final boolean skipEmptyRFC5424Records;
- private final String dfsDataTransferProtection;
- private final String dfsEncryptDataTransferCipherSuites;
-
- public Config() throws IOException {
- Properties properties = new Properties();
- Path configPath = Paths
- .get(System.getProperty("cfe_39.config.location", "/opt/teragrep/cfe_39/etc/application.properties"));
- LOGGER.info("Loading application config <[{}]>", configPath.toAbsolutePath());
-
- try (InputStream inputStream = Files.newInputStream(configPath)) {
- properties.load(inputStream);
- LOGGER.debug("Got configuration: <{}>", properties);
- }
-
- // HDFS
- this.hdfsPath = properties.getProperty("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
- this.hdfsuri = properties.getProperty("hdfsuri");
- if (this.hdfsuri == null) {
- throw new IllegalArgumentException("hdfsuri not set");
- }
-
- // HDFS pruning
- this.pruneOffset = Long.parseLong(properties.getProperty("pruneOffset", "172800000"));
- if (this.pruneOffset <= 0) {
- throw new IllegalArgumentException("pruneOffset must be set to >0, got " + pruneOffset);
- }
-
- // AVRO
- this.queueDirectory = properties.getProperty("queueDirectory", System.getProperty("user.dir") + "/etc/AVRO/");
- this.maximumFileSize = Long.parseLong(properties.getProperty("maximumFileSize", "60800000"));
- if (this.maximumFileSize <= 0) {
- throw new IllegalArgumentException("maximumFileSize must be set to >0, got " + maximumFileSize);
- }
-
- // kerberos
- this.kerberosHost = properties.getProperty("java.security.krb5.kdc");
- if (this.kerberosHost == null) {
- throw new IllegalArgumentException("kerberosHost not set");
- }
- this.kerberosRealm = properties.getProperty("java.security.krb5.realm");
- if (this.kerberosRealm == null) {
- throw new IllegalArgumentException("kerberosRealm not set");
- }
- this.hadoopAuthentication = properties.getProperty("hadoop.security.authentication");
- if (this.hadoopAuthentication == null) {
- throw new IllegalArgumentException("hadoopAuthentication not set");
- }
- this.hadoopAuthorization = properties.getProperty("hadoop.security.authorization");
- if (this.hadoopAuthorization == null) {
- throw new IllegalArgumentException("hadoopAuthorization not set");
- }
- this.kerberosPrincipal = properties.getProperty("dfs.namenode.kerberos.principal.pattern");
- if (this.kerberosPrincipal == null) {
- throw new IllegalArgumentException("kerberosPrincipal not set");
- }
- this.kerberosKeytabUser = properties.getProperty("KerberosKeytabUser");
- if (this.kerberosKeytabUser == null) {
- throw new IllegalArgumentException("kerberosKeytabUser not set");
- }
- this.kerberosKeytabPath = properties.getProperty("KerberosKeytabPath");
- if (this.kerberosKeytabPath == null) {
- throw new IllegalArgumentException("kerberosKeytabPath not set");
- }
- this.kerberosLoginAutorenewal = properties.getProperty("kerberosLoginAutorenewal");
- if (this.kerberosLoginAutorenewal == null) {
- throw new IllegalArgumentException("kerberosLoginAutorenewal not set");
- }
- this.kerberosTestMode = properties.getProperty("dfs.client.use.datanode.hostname", "false");
-
- this.dfsDataTransferProtection = properties.getProperty("dfs.data.transfer.protection");
- if (this.dfsDataTransferProtection == null) {
- throw new IllegalArgumentException("dfsDataTransferProtection not set");
- }
- this.dfsEncryptDataTransferCipherSuites = properties.getProperty("dfs.encrypt.data.transfer.cipher.suites");
- if (this.dfsEncryptDataTransferCipherSuites == null) {
- throw new IllegalArgumentException("dfsEncryptDataTransferCipherSuites not set");
- }
-
- // kafka
- this.queueTopicPattern = properties.getProperty("queueTopicPattern", "^.*$");
- this.numOfConsumers = Integer.parseInt(properties.getProperty("numOfConsumers", "1"));
-
- // skip non RFC5424 records
- this.skipNonRFC5424Records = properties.getProperty("skipNonRFC5424Records", "false").equalsIgnoreCase("true");
-
- // skip empty RFC5424 records
- this.skipEmptyRFC5424Records = properties
- .getProperty("skipEmptyRFC5424Records", "false")
- .equalsIgnoreCase("true");
-
- this.kafkaConsumerProperties = loadSubProperties(properties, "consumer.");
- String loginConfig = properties
- .getProperty("java.security.auth.login.config", System.getProperty("user.dir") + "/rpm/resources/config.jaas");
- if (loginConfig == null) {
- throw new IOException("Property java.security.auth.login.config does not exist");
- }
- if (!(new File(loginConfig)).isFile()) {
- throw new IOException("File '" + loginConfig + "' set by java.security.auth.login.config does not exist");
- }
-
- // Just for loggers to work
- Path log4j2Config = Paths
- .get(properties.getProperty("log4j2.configurationFile", System.getProperty("user.dir") + "/rpm/resources/log4j2.properties"));
- LOGGER.info("Loading log4j2 config from <[{}]>", log4j2Config.toRealPath());
- Configurator.reconfigure(log4j2Config.toUri());
- }
-
- private Properties loadSubProperties(Properties properties, String prefix) {
- Properties subProperties = new Properties();
-
- Enumeration keys = properties.keys();
- while (keys.hasMoreElements()) {
- String key = String.valueOf(keys.nextElement());
- if (key.startsWith(prefix)) {
- String value = properties.getProperty(key);
- String subKey = key.replaceFirst(prefix, "");
- subProperties.put(subKey, value);
- }
- }
- return subProperties;
- }
-
- public String getHdfsPath() {
- return hdfsPath;
- }
-
- public void setHdfsuri(String input) {
- this.hdfsuri = input;
- }
-
- public String getHdfsuri() {
- return hdfsuri;
- }
-
- public String getQueueDirectory() {
- return queueDirectory;
- }
-
- public String getQueueTopicPattern() {
- return queueTopicPattern;
- }
-
- public Properties getKafkaConsumerProperties() {
- return kafkaConsumerProperties;
- }
-
- public String getKerberosHost() {
- return kerberosHost;
- }
-
- public String getKerberosRealm() {
- return kerberosRealm;
- }
-
- public String getKerberosPrincipal() {
- return kerberosPrincipal;
- }
-
- public String getHadoopAuthentication() {
- return hadoopAuthentication;
- }
-
- public String getHadoopAuthorization() {
- return hadoopAuthorization;
- }
-
- public String getKerberosKeytabUser() {
- return kerberosKeytabUser;
- }
-
- public String getKerberosKeytabPath() {
- return kerberosKeytabPath;
- }
-
- public String getKerberosTestMode() {
- return kerberosTestMode;
- }
-
- public long getMaximumFileSize() {
- return maximumFileSize;
- }
-
- public void setMaximumFileSize(long maximumFileSize) {
- this.maximumFileSize = maximumFileSize;
- }
-
- public int getNumOfConsumers() {
- return numOfConsumers;
- }
-
- public long getPruneOffset() {
- return pruneOffset;
- }
-
- public boolean getSkipNonRFC5424Records() {
- return skipNonRFC5424Records;
- }
-
- public boolean getSkipEmptyRFC5424Records() {
- return skipEmptyRFC5424Records;
- }
-
- public String getKerberosLoginAutorenewal() {
- return kerberosLoginAutorenewal;
- }
-
- public String getDfsDataTransferProtection() {
- return dfsDataTransferProtection;
- }
-
- public String getDfsEncryptDataTransferCipherSuites() {
- return dfsEncryptDataTransferCipherSuites;
- }
-}
diff --git a/src/main/java/com/teragrep/cfe_39/Main.java b/src/main/java/com/teragrep/cfe_39/Main.java
index bb4e633c..4ba8d170 100644
--- a/src/main/java/com/teragrep/cfe_39/Main.java
+++ b/src/main/java/com/teragrep/cfe_39/Main.java
@@ -45,31 +45,80 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
+import com.teragrep.cnf_01.ConfigurationException;
+import com.teragrep.cnf_01.PathConfiguration;
+import org.apache.logging.log4j.core.config.Configurator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
-public class Main {
+public final class Main {
private static final Logger LOGGER = LoggerFactory.getLogger(Main.class);
+ /**
+ * Loads the common, Kafka (ingress) and HDFS (egress) configurations and runs the ingestion.
+ * The common configuration is read from the file named by the cfe_39.config.location system
+ * property (default /opt/teragrep/cfe_39/etc/application.properties); it names the log4j2,
+ * ingress and egress configuration files that are loaded next.
+ *
+ * @param args command line arguments, not used
+ * @throws Exception if a configuration cannot be loaded; the failure is logged and rethrown
+ */
public static void main(String[] args) throws Exception {
- Config config = null;
+ // CommonConfiguration
+ final PathConfiguration pathConfiguration = new PathConfiguration(
+ System.getProperty("cfe_39.config.location", "/opt/teragrep/cfe_39/etc/application.properties")
+ );
+ final Map map;
try {
- config = new Config();
+ map = pathConfiguration.asMap();
}
- catch (IOException e) {
- LOGGER.error("Can't load config: ", e);
- System.exit(1);
+ catch (ConfigurationException e) {
+ LOGGER.error("Failed to create PathConfiguration: <{}>", e.getMessage());
+ throw e;
}
- catch (IllegalArgumentException e) {
- LOGGER.error("Got invalid config: ", e);
- System.exit(1);
+ CommonConfiguration commonConfig = new CommonConfiguration(map);
+
+ // log4j2 configuration. log4j2ConfigurationFile() already falls back to the default path,
+ // so it is passed alone: Paths.get(first, more...) joins its arguments into one path and
+ // would append the default to the configured file name instead of using it as a fallback.
+ Path log4j2Config = Paths.get(commonConfig.log4j2ConfigurationFile());
+ Configurator.reconfigure(log4j2Config.toUri());
+
+ // KafkaConfiguration
+ final PathConfiguration kafkaPathConfiguration = new PathConfiguration(commonConfig.ingressConfigurationFile());
+ final Map kafkaMap;
+ try {
+ kafkaMap = kafkaPathConfiguration.asMap();
}
+ catch (ConfigurationException e) {
+ LOGGER.error("Failed to create PathConfiguration: <{}>", e.getMessage());
+ throw e;
+ }
+ KafkaConfiguration kafkaConfig = new KafkaConfiguration(kafkaMap);
+
+ // HdfsConfiguration
+ final PathConfiguration hdfsPathConfiguration = new PathConfiguration(commonConfig.egressConfigurationFile());
+ final Map hdfsMap;
+ try {
+ hdfsMap = hdfsPathConfiguration.asMap();
+ }
+ catch (ConfigurationException e) {
+ LOGGER.error("Failed to create PathConfiguration: <{}>", e.getMessage());
+ throw e;
+ }
+ HdfsConfiguration hdfsConfig = new HdfsConfiguration(hdfsMap);
+
LOGGER.info("Running main program");
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
+ HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(commonConfig, hdfsConfig, kafkaConfig);
hdfsDataIngestion.run();
}
}
diff --git a/src/main/java/com/teragrep/cfe_39/configuration/CommonConfiguration.java b/src/main/java/com/teragrep/cfe_39/configuration/CommonConfiguration.java
new file mode 100644
index 00000000..4b14b22f
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/configuration/CommonConfiguration.java
@@ -0,0 +1,176 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.configuration;
+
+import org.apache.logging.log4j.core.config.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+/**
+ * Accessors for the common settings held in the key/value map given to the constructor.
+ * Each accessor reads the map when it is called, so a missing or invalid mandatory value is
+ * reported by the accessor and not by the constructor.
+ */
+public final class CommonConfiguration {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(CommonConfiguration.class);
+
+    private final Map<String, String> config;
+
+    /**
+     * @param map configuration keys and values; the map is kept by reference, not copied
+     */
+    public CommonConfiguration(Map<String, String> map) {
+        this.config = map;
+    }
+
+    // accessors for configuration file paths.
+
+    /**
+     * @return value of egress.configurationFile, or rpm/resources/egress.properties under the
+     *         working directory (user.dir) when the key is absent
+     */
+    public String egressConfigurationFile() {
+        return config
+                .getOrDefault("egress.configurationFile", System.getProperty("user.dir") + "/rpm/resources/egress.properties");
+    }
+
+    /**
+     * @return value of ingress.configurationFile, or rpm/resources/ingress.properties under the
+     *         working directory (user.dir) when the key is absent
+     */
+    public String ingressConfigurationFile() {
+        return config
+                .getOrDefault("ingress.configurationFile", System.getProperty("user.dir") + "/rpm/resources/ingress.properties");
+    }
+
+    /**
+     * @return value of log4j2.configurationFile, or rpm/resources/log4j2.properties under the
+     *         working directory (user.dir) when the key is absent
+     */
+    public String log4j2ConfigurationFile() {
+        return config
+                .getOrDefault("log4j2.configurationFile", System.getProperty("user.dir") + "/rpm/resources/log4j2.properties");
+    }
+
+    // accessors for the configuration parameters.
+
+    /**
+     * @return value of queueTopicPattern, or ".*" when the key is absent
+     */
+    public String queueTopicPattern() {
+        return config.getOrDefault("queueTopicPattern", ".*");
+    }
+
+    /**
+     * @return value of queueDirectory, or rpm/resources/queue under the working directory
+     *         (user.dir) when the key is absent
+     */
+    public String queueDirectory() {
+        return config.getOrDefault("queueDirectory", System.getProperty("user.dir") + "/rpm/resources/queue");
+    }
+
+    /**
+     * @return true only when skipNonRFC5424Records is "true", ignoring case
+     * @throws ConfigurationException if skipNonRFC5424Records is not set
+     */
+    public boolean skipNonRFC5424Records() {
+        final String skipString = config.get("skipNonRFC5424Records");
+        if (skipString == null) {
+            throw new ConfigurationException("Configuration error. <skipNonRFC5424Records> must be set.");
+        }
+        else {
+            return Boolean.parseBoolean(skipString);
+        }
+    }
+
+    /**
+     * @return true only when skipEmptyRFC5424Records is "true", ignoring case
+     * @throws ConfigurationException if skipEmptyRFC5424Records is not set
+     */
+    public boolean skipEmptyRFC5424Records() {
+        final String skipString = config.get("skipEmptyRFC5424Records");
+        if (skipString == null) {
+            throw new ConfigurationException("Configuration error. <skipEmptyRFC5424Records> must be set.");
+        }
+        else {
+            return Boolean.parseBoolean(skipString);
+        }
+    }
+
+    /**
+     * @return value of consumerTimeout as a long greater than zero
+     * @throws ConfigurationException if consumerTimeout is not set or is not greater than zero
+     * @throws RuntimeException       wrapping the NumberFormatException if the value is not a long
+     */
+    public long consumerTimeout() {
+        final String consumerTimeoutString = config.get("consumerTimeout");
+        if (consumerTimeoutString == null) {
+            throw new ConfigurationException("Configuration error. <consumerTimeout> must be set.");
+        }
+        else {
+            final long consumerTimeout;
+            try {
+                consumerTimeout = Long.parseLong(consumerTimeoutString);
+            }
+            catch (NumberFormatException e) {
+                LOGGER.error("Configuration error. Invalid value for <consumerTimeout>: <{}>", e.getMessage());
+                throw new RuntimeException(e);
+            }
+            if (consumerTimeout <= 0) {
+                throw new ConfigurationException(
+                        "Configuration error. <consumerTimeout> must be a positive long value."
+                );
+            }
+            else {
+                return consumerTimeout;
+            }
+        }
+    }
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/configuration/HdfsConfiguration.java b/src/main/java/com/teragrep/cfe_39/configuration/HdfsConfiguration.java
new file mode 100644
index 00000000..2d1aeb1c
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/configuration/HdfsConfiguration.java
@@ -0,0 +1,243 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.configuration;
+
+import org.apache.logging.log4j.core.config.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+/**
+ * Accessors for the HDFS (egress) settings held in the key/value map given to the constructor.
+ * Every setting is mandatory: an accessor throws ConfigurationException when its key is absent.
+ * Values are read from the map when an accessor is called, not when the object is constructed.
+ */
+public final class HdfsConfiguration {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(HdfsConfiguration.class);
+
+    private final Map<String, String> config;
+
+    /**
+     * @param config configuration keys and values; the map is kept by reference, not copied
+     */
+    public HdfsConfiguration(Map<String, String> config) {
+        this.config = config;
+    }
+
+    // accessors for the configuration parameters.
+
+    /**
+     * @return value of pruneOffset as a long greater than zero
+     * @throws ConfigurationException if pruneOffset is not set or is not greater than zero
+     * @throws RuntimeException       wrapping the NumberFormatException if the value is not a long
+     */
+    public long pruneOffset() {
+        final String numString = required("pruneOffset");
+        final long pruneOffset;
+        try {
+            pruneOffset = Long.parseLong(numString);
+        }
+        catch (NumberFormatException e) {
+            LOGGER.error("Configuration error. Invalid value for <pruneOffset>: <{}>", e.getMessage());
+            throw new RuntimeException(e);
+        }
+        if (pruneOffset <= 0) {
+            throw new ConfigurationException("Configuration error. <pruneOffset> must be a positive integer.");
+        }
+        else {
+            return pruneOffset;
+        }
+    }
+
+    /**
+     * @return value of hdfsuri
+     * @throws ConfigurationException if hdfsuri is not set
+     */
+    public String hdfsUri() {
+        return required("hdfsuri");
+    }
+
+    /**
+     * @return value of hdfsPath
+     * @throws ConfigurationException if hdfsPath is not set
+     */
+    public String hdfsPath() {
+        return required("hdfsPath");
+    }
+
+    /**
+     * @return value of java.security.krb5.kdc
+     * @throws ConfigurationException if java.security.krb5.kdc is not set
+     */
+    public String javaSecurityKrb5Kdc() {
+        return required("java.security.krb5.kdc");
+    }
+
+    /**
+     * @return value of java.security.krb5.realm
+     * @throws ConfigurationException if java.security.krb5.realm is not set
+     */
+    public String javaSecurityKrb5Realm() {
+        return required("java.security.krb5.realm");
+    }
+
+    /**
+     * @return value of hadoop.security.authentication
+     * @throws ConfigurationException if hadoop.security.authentication is not set
+     */
+    public String hadoopSecurityAuthentication() {
+        return required("hadoop.security.authentication");
+    }
+
+    /**
+     * @return value of hadoop.security.authorization
+     * @throws ConfigurationException if hadoop.security.authorization is not set
+     */
+    public String hadoopSecurityAuthorization() {
+        return required("hadoop.security.authorization");
+    }
+
+    /**
+     * @return value of dfs.namenode.kerberos.principal.pattern
+     * @throws ConfigurationException if dfs.namenode.kerberos.principal.pattern is not set
+     */
+    public String dfsNamenodeKerberosPrincipalPattern() {
+        return required("dfs.namenode.kerberos.principal.pattern");
+    }
+
+    /**
+     * @return value of KerberosKeytabUser
+     * @throws ConfigurationException if KerberosKeytabUser is not set
+     */
+    public String KerberosKeytabUser() {
+        return required("KerberosKeytabUser");
+    }
+
+    /**
+     * @return value of KerberosKeytabPath
+     * @throws ConfigurationException if KerberosKeytabPath is not set
+     */
+    public String KerberosKeytabPath() {
+        return required("KerberosKeytabPath");
+    }
+
+    /**
+     * @return value of dfs.client.use.datanode.hostname
+     * @throws ConfigurationException if dfs.client.use.datanode.hostname is not set
+     */
+    public String dfsClientUseDatanodeHostname() {
+        return required("dfs.client.use.datanode.hostname");
+    }
+
+    /**
+     * @return value of hadoop.kerberos.keytab.login.autorenewal.enabled
+     * @throws ConfigurationException if hadoop.kerberos.keytab.login.autorenewal.enabled is not set
+     */
+    public String hadoopKerberosKeytabLoginAutorenewalEnabled() {
+        return required("hadoop.kerberos.keytab.login.autorenewal.enabled");
+    }
+
+    /**
+     * @return value of dfs.data.transfer.protection
+     * @throws ConfigurationException if dfs.data.transfer.protection is not set
+     */
+    public String dfsDataTransferProtection() {
+        return required("dfs.data.transfer.protection");
+    }
+
+    /**
+     * @return value of dfs.encrypt.data.transfer.cipher.suites
+     * @throws ConfigurationException if dfs.encrypt.data.transfer.cipher.suites is not set
+     */
+    public String dfsEncryptDataTransferCipherSuites() {
+        return required("dfs.encrypt.data.transfer.cipher.suites");
+    }
+
+    /**
+     * @return value of maximumFileSize as a long greater than zero
+     * @throws ConfigurationException if maximumFileSize is not set or is not greater than zero
+     * @throws RuntimeException       wrapping the NumberFormatException if the value is not a long
+     */
+    public long maximumFileSize() {
+        final String numString = required("maximumFileSize");
+        final long maximumFileSize;
+        try {
+            maximumFileSize = Long.parseLong(numString);
+        }
+        catch (NumberFormatException e) {
+            LOGGER.error("Configuration error. Invalid value for <maximumFileSize>: <{}>", e.getMessage());
+            throw new RuntimeException(e);
+        }
+        if (maximumFileSize <= 0) {
+            throw new ConfigurationException(
+                    "Configuration error. <maximumFileSize> must be a positive long value."
+            );
+        }
+        else {
+            return maximumFileSize;
+        }
+    }
+
+    /**
+     * Looks up a mandatory setting.
+     *
+     * @param key configuration key to read
+     * @return the value stored for the key
+     * @throws ConfigurationException naming the key if the map has no value for it
+     */
+    private String required(final String key) {
+        final String value = config.get(key);
+        if (value == null) {
+            throw new ConfigurationException("Configuration error. <" + key + "> must be set.");
+        }
+        else {
+            return value;
+        }
+    }
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/configuration/KafkaConfiguration.java b/src/main/java/com/teragrep/cfe_39/configuration/KafkaConfiguration.java
new file mode 100644
index 00000000..8c27bf35
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/configuration/KafkaConfiguration.java
@@ -0,0 +1,261 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.configuration;
+
+import org.apache.logging.log4j.core.config.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+public final class KafkaConfiguration {
+
+ private final Logger LOGGER = LoggerFactory.getLogger(KafkaConfiguration.class);
+
+ private final Map config;
+
+ public KafkaConfiguration(Map config) {
+ this.config = config;
+ }
+
+ // printers for the configuration parameters.
+
+ public String javaSecurityAuthLoginConfig() {
+ final String javaSecurityAuthLoginConfig = config.get("java.security.auth.login.config");
+ if (javaSecurityAuthLoginConfig == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return javaSecurityAuthLoginConfig;
+ }
+ }
+
+ public String bootstrapServers() {
+ final String bootstrapServers = config.get("bootstrap.servers");
+ if (bootstrapServers == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return bootstrapServers;
+ }
+ }
+
+ public String autoOffsetReset() {
+ final String autoOffsetReset = config.get("auto.offset.reset");
+ if (autoOffsetReset == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return autoOffsetReset;
+ }
+ }
+
+ public String enableAutoCommit() {
+ final String enableAutoCommit = config.get("enable.auto.commit");
+ if (enableAutoCommit == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return enableAutoCommit;
+ }
+ }
+
+ public String groupId() {
+ final String groupId = config.get("group.id");
+ if (groupId == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return groupId;
+ }
+ }
+
+ public String securityProtocol() {
+ final String securityProtocol = config.get("security.protocol");
+ if (securityProtocol == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return securityProtocol;
+ }
+ }
+
+ public String saslMechanism() {
+ final String saslMechanism = config.get("sasl.mechanism");
+ if (saslMechanism == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return saslMechanism;
+ }
+ }
+
+ public long maxPollRecords() {
+ final String numString = config.get("max.poll.records");
+ if (numString == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ final long maxPollRecords;
+ try {
+ maxPollRecords = Long.parseLong(numString);
+ }
+ catch (NumberFormatException e) {
+ LOGGER.error("Configuration error. Invalid value for : <{}>", e.getMessage());
+ throw new RuntimeException(e);
+ }
+ if (maxPollRecords < 0) {
+ throw new ConfigurationException("Configuration error. must be a positive value.");
+ }
+ else {
+ return maxPollRecords;
+ }
+ }
+ }
+
+ public long fetchMaxBytes() {
+ final String numString = config.get("fetch.max.bytes");
+ if (numString == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ final long fetchMaxBytes;
+ try {
+ fetchMaxBytes = Long.parseLong(numString);
+ }
+ catch (NumberFormatException e) {
+ LOGGER.error("Configuration error. Invalid value for : <{}>", e.getMessage());
+ throw new RuntimeException(e);
+ }
+ if (fetchMaxBytes < 0) {
+ throw new ConfigurationException("Configuration error. must be a positive value.");
+ }
+ else {
+ return fetchMaxBytes;
+ }
+ }
+ }
+
+ public long requestTimeoutMs() {
+ final String numString = config.get("request.timeout.ms");
+ if (numString == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ final long requestTimeoutMs;
+ try {
+ requestTimeoutMs = Long.parseLong(numString);
+ }
+ catch (NumberFormatException e) {
+ LOGGER.error("Configuration error. Invalid value for : <{}>", e.getMessage());
+ throw new RuntimeException(e);
+ }
+ if (requestTimeoutMs < 0) {
+ throw new ConfigurationException("Configuration error. must be a positive value.");
+ }
+ else {
+ return requestTimeoutMs;
+ }
+ }
+ }
+
+ public long maxPollIntervalMs() {
+ final String numString = config.get("max.poll.interval.ms");
+ if (numString == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ final long maxPollIntervalMs;
+ try {
+ maxPollIntervalMs = Long.parseLong(numString);
+ }
+ catch (NumberFormatException e) {
+ LOGGER.error("Configuration error. Invalid value for : <{}>", e.getMessage());
+ throw new RuntimeException(e);
+ }
+ if (maxPollIntervalMs < 0) {
+ throw new ConfigurationException("Configuration error. must be a positive value.");
+ }
+ else {
+ return maxPollIntervalMs;
+ }
+ }
+ }
+
+ public boolean useMockKafkaConsumer() {
+ final String useMockKafkaConsumer = config.get("useMockKafkaConsumer");
+ if (useMockKafkaConsumer == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ return Boolean.parseBoolean(useMockKafkaConsumer);
+ }
+ }
+
+ public int numOfConsumers() {
+ final String numString = config.get("numOfConsumers");
+ if (numString == null) {
+ throw new ConfigurationException("Configuration error. must be set.");
+ }
+ else {
+ final int numOfConsumers;
+ try {
+ numOfConsumers = Integer.parseInt(numString);
+ }
+ catch (NumberFormatException e) {
+ LOGGER.error("Configuration error. Invalid value for : <{}>", e.getMessage());
+ throw new RuntimeException(e);
+ }
+ if (numOfConsumers <= 0) {
+ throw new ConfigurationException("Configuration error. must be a positive integer.");
+ }
+ else {
+ return numOfConsumers;
+ }
+ }
+ }
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistribution.java
similarity index 92%
rename from src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java
rename to src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistribution.java
index ada4c4fd..6f141b62 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/Offset.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistribution.java
@@ -45,11 +45,10 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-public interface Offset {
+import java.util.List;
+import java.util.function.Consumer;
- boolean isNull();
+/**
+ * A consumer of Kafka record batches that can additionally be asked to finish its pending work and reset
+ * its per-partition state.
+ */
+public interface BatchDistribution extends Consumer<List<KafkaRecordImpl>> {
- byte[] getRecord();
-
- String offsetToJSON();
+    /**
+     * Finishes any pending work and drops per-partition state; called by the rebalance listener when
+     * partitions are revoked.
+     */
+    void rebalance();
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistributionImpl.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistributionImpl.java
new file mode 100644
index 00000000..ce1781c3
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/BatchDistributionImpl.java
@@ -0,0 +1,200 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.google.gson.*;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.metrics.topic.TopicCounter;
+import com.teragrep.cfe_39.metrics.DurationStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.codahale.metrics.Timer;
+
+import java.io.*;
+import java.time.Instant;
+import java.util.*;
+
+/* The kafka stream should first be deserialized using rlo_06 and then serialized again using avro and stored in HDFS.
+ The target where the record is stored in HDFS is based on the topic, partition and offset. ie. topic_name/0.123456 where offset is 123456
+ The mock consumer is activated for testing using the configuration file: readerKafkaProperties.getProperty("useMockKafkaConsumer", "false")*/
+
+/**
+ * Routes each record of a Kafka batch to the {@link PartitionFileImpl} of the partition it came from,
+ * commits every partition file once the batch has been distributed, and publishes throughput metrics.
+ * <p>
+ * Instances keep mutable state ({@code lastTimeCalled} and the partition file map) without any
+ * synchronisation.
+ */
+public final class BatchDistributionImpl implements BatchDistribution {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(BatchDistributionImpl.class);
+
+    private final String topic;
+    private final DurationStatistics durationStatistics;
+    private final TopicCounter topicCounter;
+    private long lastTimeCalled;
+    // Keyed by the decimal string form of the partition number.
+    private final Map<String, PartitionFileImpl> partitionFileMap;
+    private final PartitionFileFactory partitionFileFactory;
+
+    /**
+     * Convenience constructor: starts with no partition files, the current time as the last call time and
+     * a {@link PartitionFileFactory} built from the two configurations.
+     */
+    public BatchDistributionImpl(
+            CommonConfiguration config,
+            HdfsConfiguration hdfsConfig,
+            String topic,
+            DurationStatistics durationStatistics,
+            TopicCounter topicCounter
+    ) {
+        this(
+                topic,
+                durationStatistics,
+                topicCounter,
+                new HashMap<>(),
+                Instant.now().toEpochMilli(),
+                new PartitionFileFactory(config, hdfsConfig)
+        );
+    }
+
+    /**
+     * @param topic                topic name, used in log messages only
+     * @param durationStatistics   receives the record and byte totals of each batch
+     * @param topicCounter         receives latency and throughput figures of each batch
+     * @param partitionFileMap     partition files tracked so far; stored by reference and mutated
+     * @param lastTimeCalled       epoch milliseconds used as the reference for the first latency figure
+     * @param partitionFileFactory creates the partition file for a partition seen for the first time
+     */
+    public BatchDistributionImpl(
+            String topic,
+            DurationStatistics durationStatistics,
+            TopicCounter topicCounter,
+            Map<String, PartitionFileImpl> partitionFileMap,
+            long lastTimeCalled,
+            PartitionFileFactory partitionFileFactory
+    ) {
+        this.topic = topic;
+        this.durationStatistics = durationStatistics;
+        this.topicCounter = topicCounter;
+        this.partitionFileMap = partitionFileMap;
+        this.lastTimeCalled = lastTimeCalled;
+        this.partitionFileFactory = partitionFileFactory;
+    }
+
+    /**
+     * Distributes {@code batch} to the partition files, commits every tracked partition file and updates
+     * the metrics. An empty batch still triggers the commit of all tracked partition files.
+     *
+     * @param batch records with their topic-partition metadata
+     * @throws RuntimeException wrapping the {@link IOException} when a partition file cannot be created
+     *                          or committed
+     */
+    @Override
+    public void accept(List<KafkaRecordImpl> batch) {
+        final long thisTime = Instant.now().toEpochMilli();
+        final long ftook = thisTime - lastTimeCalled;
+        topicCounter.setKafkaLatency(ftook);
+        if (LOGGER.isDebugEnabled()) {
+            // ftook is 0 when two calls land in the same millisecond (rebalance() calls accept() directly);
+            // clamp the divisor so enabling debug logging cannot raise ArithmeticException.
+            final long divisor = Math.max(ftook, 1L);
+            LOGGER
+                    .debug(
+                            "Searching batch for <[{}]> with <{}> records took <{}> milliseconds. <{}> EPS. ", topic,
+                            batch.size(), ftook, batch.size() * 1000L / divisor
+                    );
+        }
+        final Timer timer = new Timer();
+        final Timer.Context context = timer.time();
+
+        final long batchBytes = distributeRecords(batch);
+        // Only after the whole batch has been distributed are the partition files committed.
+        commitAllPartitionFiles();
+
+        // Measure performance.
+        long took = context.stop() / 1000000L; // Convert nanoseconds to milliseconds.
+        topicCounter.setDatabaseLatency(took);
+        if (took == 0) {
+            took = 1; // avoid division by zero in the rate calculations below
+        }
+        final long rps = batch.size() * 1000L / took;
+        topicCounter.setRecordsPerSecond(rps);
+        final long bps = batchBytes * 1000 / took;
+        topicCounter.setBytesPerSecond(bps);
+        durationStatistics.addAndGetRecords(batch.size());
+        durationStatistics.addAndGetBytes(batchBytes);
+        topicCounter.addToTotalBytes(batchBytes);
+        topicCounter.addToTotalRecords(batch.size());
+        if (LOGGER.isDebugEnabled()) {
+            LOGGER
+                    .debug(
+                            "Sent batch for <[{}]> with records <{}> and size <{}> KB took <{}> milliseconds. <{}> RPS. <{}> KB/s ",
+                            topic, batch.size(), batchBytes / 1024, took, rps, bps / 1024
+                    );
+        }
+        lastTimeCalled = Instant.now().toEpochMilli();
+    }
+
+    /**
+     * Commits whatever the tracked partition files still hold by running {@link #accept(List)} with an
+     * empty batch, then deletes every partition file and forgets it.
+     */
+    @Override
+    public void rebalance() {
+        accept(new ArrayList<>());
+        for (final PartitionFileImpl partitionFile : partitionFileMap.values()) {
+            partitionFile.delete();
+        }
+        partitionFileMap.clear();
+    }
+
+    /** Adds every record to the partition file of its partition, creating files on demand; returns the summed record sizes. */
+    private long distributeRecords(final List<KafkaRecordImpl> batch) {
+        long batchBytes = 0L;
+        for (final KafkaRecordImpl kafkaRecord : batch) {
+            final String partitionKey = Integer.toString(kafkaRecord.topicPartition().partition());
+            PartitionFileImpl partitionFile = partitionFileMap.get(partitionKey);
+            if (partitionFile == null) {
+                // First record seen for this partition: create the partition file responsible for it.
+                try {
+                    partitionFile = partitionFileFactory.partitionFor(kafkaRecord.topicPartition());
+                }
+                catch (IOException e) {
+                    LOGGER
+                            .error(
+                                    "Failed to create new PartitionFileImpl for record <{}>",
+                                    kafkaRecord.topicPartition(), e
+                            );
+                    throw new RuntimeException(e);
+                }
+                partitionFileMap.put(partitionKey, partitionFile);
+            }
+            partitionFile.addRecord(kafkaRecord);
+            batchBytes = batchBytes + kafkaRecord.size(); // metrics
+        }
+        return batchBytes;
+    }
+
+    /** Commits every tracked partition file; on the first failure deletes all partition files and rethrows. */
+    private void commitAllPartitionFiles() {
+        for (final Map.Entry<String, PartitionFileImpl> entry : partitionFileMap.entrySet()) {
+            try {
+                entry.getValue().commitRecords();
+            }
+            catch (IOException e) {
+                LOGGER
+                        .error(
+                                "Failed to write the SyslogRecords to PartitionFileImpl <{}> in topic <{}>",
+                                entry.getKey(), topic, e
+                        );
+                // Cleanup resources
+                for (final PartitionFileImpl partitionFile : partitionFileMap.values()) {
+                    partitionFile.delete();
+                }
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/ConsumerRebalanceListenerImpl.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/ConsumerRebalanceListenerImpl.java
new file mode 100644
index 00000000..2934bbe2
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/ConsumerRebalanceListenerImpl.java
@@ -0,0 +1,116 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.kafka.clients.consumer.Consumer;
+import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
+import org.apache.kafka.common.TopicPartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+public final class ConsumerRebalanceListenerImpl implements ConsumerRebalanceListener {
+
+ private final Logger LOGGER = LoggerFactory.getLogger(ConsumerRebalanceListenerImpl.class);
+
+ private final Consumer kafkaConsumer;
+ private final BatchDistributionImpl callbackFunction;
+ private final HdfsConfiguration config;
+
+ public ConsumerRebalanceListenerImpl(
+ Consumer kafkaConsumer,
+ BatchDistributionImpl callbackFunction,
+ HdfsConfiguration config
+ ) {
+ this.kafkaConsumer = kafkaConsumer;
+ this.callbackFunction = callbackFunction;
+ this.config = config;
+ }
+
+ @Override
+ public void onPartitionsRevoked(Collection partitions) {
+ // Flush any records from the temporary files to HDFS to synchronize database with committed kafka offsets, and clean up PartitionFile list.
+ LOGGER.info("onPartitionsRevoked triggered");
+ callbackFunction.rebalance();
+ }
+
+ @Override
+ public void onPartitionsAssigned(Collection partitions) {
+ // Generates offsets of the already committed records for Kafka and passes them to the kafka consumers.
+ LOGGER.info("onPartitionsAssigned triggered");
+ // Initialize FileSystem
+ FileSystemFactoryImpl fileSystemFactoryImpl = new FileSystemFactoryImpl(config);
+ FileSystem fs;
+ try {
+ fs = fileSystemFactoryImpl.create(false);
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ Map hdfsStartOffsets = new HashMap<>();
+ try (HDFSRead hr = new HDFSRead(config, fs)) {
+ hdfsStartOffsets = hr.hdfsStartOffsets();
+ LOGGER.debug("topicPartitionStartMap generated succesfully: <{}>", hdfsStartOffsets);
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ for (TopicPartition topicPartition : partitions) {
+ if (hdfsStartOffsets.containsKey(topicPartition)) {
+ long position = kafkaConsumer.position(topicPartition);
+ if (position < hdfsStartOffsets.get(topicPartition)) {
+ kafkaConsumer.seek(topicPartition, hdfsStartOffsets.get(topicPartition));
+ }
+ }
+ }
+ }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java
deleted file mode 100644
index ae519335..00000000
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/DatabaseOutput.java
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * HDFS Data Ingestion for PTH_06 use CFE-39
- * Copyright (C) 2021-2024 Suomen Kanuuna Oy
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- *
- * Additional permission under GNU Affero General Public License version 3
- * section 7
- *
- * If you modify this Program, or any covered work, by linking or combining it
- * with other code, such other code is not for that reason alone subject to any
- * of the requirements of the GNU Affero GPL version 3 as long as this Program
- * is the same Program as licensed from Suomen Kanuuna Oy without any additional
- * modifications.
- *
- * Supplemented terms under GNU Affero General Public License version 3
- * section 7
- *
- * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
- * versions must be marked as "Modified version of" The Program.
- *
- * Names of the licensors and authors may not be used for publicity purposes.
- *
- * No rights are granted for use of trade names, trademarks, or service marks
- * which are in The Program if any.
- *
- * Licensee must indemnify licensors and authors for any liability that these
- * contractual assumptions impose on licensors and authors.
- *
- * To the extent this program is licensed as part of the Commercial versions of
- * Teragrep, the applicable Commercial License may apply to this file if you as
- * a licensee so wish it.
- */
-package com.teragrep.cfe_39.consumers.kafka;
-
-import com.google.gson.*;
-import com.teragrep.cfe_39.Config;
-import com.teragrep.cfe_39.avro.SyslogRecord;
-import com.teragrep.cfe_39.consumers.kafka.queue.WritableQueue;
-import com.teragrep.cfe_39.metrics.topic.TopicCounter;
-import com.teragrep.cfe_39.metrics.DurationStatistics;
-import com.teragrep.rlo_06.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
-import java.nio.charset.StandardCharsets;
-import java.time.Instant;
-import java.time.ZonedDateTime;
-import java.util.List;
-import java.util.function.Consumer;
-
-import java.nio.ByteBuffer;
-
-/* The kafka stream should first be deserialized using rlo_06 and then serialized again using avro and stored in HDFS.
- The target where the record is stored in HDFS is based on the topic, partition and offset. ie. topic_name/0.123456 where offset is 123456
- The mock consumer is activated for testing using the configuration file: readerKafkaProperties.getProperty("useMockKafkaConsumer", "false")*/
-
-public class DatabaseOutput implements Consumer> {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(DatabaseOutput.class);
- private final RFC5424Frame rfc5424Frame = new RFC5424Frame(false);
-
- private final String table;
-
- private final DurationStatistics durationStatistics;
- private final TopicCounter topicCounter;
-
- private long lastTimeCalled = Instant.now().toEpochMilli();
-
- private SyslogAvroWriter syslogAvroWriter;
- private final long maximumFileSize;
- private final WritableQueue writableQueue;
- private final ByteBuffer sourceConcatenationBuffer;
- private final SDVector teragrepStreamName;
- private final SDVector teragrepDirectory;
- private final SDVector eventNodeSourceSource;
- private final SDVector eventNodeRelaySource;
- private final SDVector eventNodeSourceSourceModule;
- private final SDVector eventNodeRelaySourceModule;
- private final SDVector eventNodeSourceHostname;
- private final SDVector eventNodeRelayHostname;
- private final SDVector originHostname;
- private File syslogFile;
- private final Config config;
- private final boolean skipNonRFC5424Records;
- private final boolean skipEmptyRFC5424Records;
-
- public DatabaseOutput(
- Config config,
- String table,
- DurationStatistics durationStatistics,
- TopicCounter topicCounter
- ) {
- this.config = config;
- this.table = table;
- this.durationStatistics = durationStatistics;
- this.topicCounter = topicCounter;
- this.maximumFileSize = config.getMaximumFileSize();
-
- // queueDirectory and queueNamePrefix are only used for temporarily storing the AVRO-serialized files before committing them to HDFS when the file size reaches the threshold (or all records are processed).
- this.writableQueue = new WritableQueue(config.getQueueDirectory(), table);
-
- this.sourceConcatenationBuffer = ByteBuffer.allocateDirect(256 * 1024);
- teragrepStreamName = new SDVector("teragrep@48577", "streamname");
- teragrepDirectory = new SDVector("teragrep@48577", "directory");
- this.eventNodeSourceSource = new SDVector("event_node_source@48577", "source");
- this.eventNodeRelaySource = new SDVector("event_node_relay@48577", "source");
- this.eventNodeSourceSourceModule = new SDVector("event_node_source@48577", "source_module");
- this.eventNodeRelaySourceModule = new SDVector("event_node_relay@48577", "source_module");
- this.eventNodeSourceHostname = new SDVector("event_node_source@48577", "hostname");
- this.eventNodeRelayHostname = new SDVector("event_node_relay@48577", "hostname");
- this.originHostname = new SDVector("origin@48577", "hostname");
- this.skipNonRFC5424Records = config.getSkipNonRFC5424Records();
- this.skipEmptyRFC5424Records = config.getSkipEmptyRFC5424Records();
- }
-
- // Checks that the filesize stays under the defined maximum file size. If the file is about to go over target limit commits the file to HDFS and returns true, otherwise does nothing and returns false.
- private boolean writeToHdfs(long fileSize, JsonObject recordOffsetObjectJo) {
- try {
- // If the syslogAvroWriter is already initialized, check the filesize so it doesn't go above maximumFileSize.
- if (fileSize > maximumFileSize) {
- // file too large for adding the new record to it, write the still adequately sized AVRO-file to the HDFS database and create a new empty AVRO-file.
-
- // This part closes the writing of now "complete" AVRO-file and stores the file to HDFS.
- syslogAvroWriter.close();
- try (HDFSWrite writer = new HDFSWrite(config, recordOffsetObjectJo)) {
- writer.commit(syslogFile); // commits the final AVRO-file to HDFS.
- }
- return true;
- }
- }
- catch (IOException ioException) {
- throw new UncheckedIOException(ioException);
- }
- return false;
- }
-
- private long rfc3339ToEpoch(ZonedDateTime zonedDateTime) {
- final Instant instant = zonedDateTime.toInstant();
-
- final long MICROS_PER_SECOND = 1000L * 1000L;
- final long NANOS_PER_MICROS = 1000L;
- final long sec = Math.multiplyExact(instant.getEpochSecond(), MICROS_PER_SECOND);
-
- return Math.addExact(sec, instant.getNano() / NANOS_PER_MICROS);
- }
-
- /* Input parameter is a list of RecordOffsetObjects. Each object contains a record and its metadata (topic, partition and offset).
- Each partition will get their set of exclusive AVRO-files in HDFS.
- The target where the record is stored in HDFS is based on the topic, partition and last offset. ie. topic_name/0.123456 where last written record's offset is 123456.
- AVRO-file with a path/name that starts with topic_name/0.X should only contain records from the 0th partition of topic named topic_name, topic_name/1.X should only contain records from 1st partition, etc.
- AVRO-files are created dynamically, thus it is not known which record (and its offset) is written to the file last before committing it to HDFS. The final name for the HDFS file is decided only when the file is committed to HDFS.*/
- @Override
- public void accept(List recordOffsetObjectList) {
- long thisTime = Instant.now().toEpochMilli();
- long ftook = thisTime - lastTimeCalled;
- topicCounter.setKafkaLatency(ftook);
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Fuura searching your batch for <[{}]> with records <{}> and took <{}> milliseconds. <{}> EPS. ",
- table, recordOffsetObjectList.size(), (ftook),
- (recordOffsetObjectList.size() * 1000L / ftook)
- );
- }
- long batchBytes = 0L;
-
- /* The recordOffsetObjectList loop will go through all the objects in the list.
- While it goes through the list, the contents of the objects are serialized into an AVRO-file.
- When the file size is about to go above 64M, commit the file into HDFS using the latest topic/partition/offset values as the filename and start fresh with a new empty AVRO-file.
- Serialize the object that was going to make the file go above 64M into the now empty AVRO-file and continue the loop.
- TODO: If the prod-environment recordOffsetObjectList ordering is different from what it is in the test environment, add a function that reorders the list based on partition and offset (or better yet, make several AVRO-files that are being used at the same time rather than doing it one AVRO-file at a time as the offset ordering within partitions should always be correct in all scenarios).*/
- Offset lastObject = new NullOffset(); // Set to null object before initializing as RecordOffsetObject.
- JsonObject lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
- long start = Instant.now().toEpochMilli(); // Starts measuring performance here. Measures how long it takes to process the whole recordOffsetObjectList.
- // This loop goes through all the records of the mock data in a single session.
- for (RecordOffset recordOffsetObject : recordOffsetObjectList) {
- JsonObject recordOffsetObjectJo = JsonParser
- .parseString(recordOffsetObject.offsetToJSON())
- .getAsJsonObject();
- // Initializing syslogAvroWriter and lastObject.
- if (syslogAvroWriter == null && lastObject.isNull()) {
- try {
- writableQueue
- .setQueueNamePrefix(recordOffsetObjectJo.get("topic").getAsString() + recordOffsetObjectJo.get("partition").getAsString());
- syslogFile = writableQueue.getNextWritableFile();
- // The HDFS filename is only finalized when the AVRO-serialized file is finalized, because every Kafka-record added to the file is going to change the offset that is going to be used for the filename.
- syslogAvroWriter = new SyslogAvroWriter(syslogFile);
- lastObject = recordOffsetObject;
- lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
- }
- catch (IOException ioException) {
- throw new IllegalArgumentException(ioException);
- }
- }
- else {
- try {
- if (
- lastObjectJo.get("topic").getAsString().equals(recordOffsetObjectJo.get("topic").getAsString())
- && lastObjectJo.get("partition").getAsString().equals(recordOffsetObjectJo.get("partition").getAsString())
- ) {
- // Records left to consume in the current partition.
- boolean fileCommitted = writeToHdfs(syslogAvroWriter.getFileSize(), lastObjectJo);
- if (fileCommitted) {
- // This part defines a new empty file to which the new AVRO-serialized records are stored until it again hits the size limit defined in config.
- writableQueue
- .setQueueNamePrefix(recordOffsetObjectJo.get("topic").getAsString() + recordOffsetObjectJo.get("partition").getAsString());
- syslogFile = writableQueue.getNextWritableFile();
- syslogAvroWriter = new SyslogAvroWriter(syslogFile);
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Target file size reached, file <{}> stored to <{}> in HDFS", syslogFile
- .getName(),
- lastObjectJo.get("topic").getAsString() + "/" + lastObjectJo.get("partition").getAsString() + "." + lastObjectJo.get("offset").getAsString()
- );
- }
- }
- else {
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Target file size not yet reached, continuing writing records to <{}>.",
- syslogFile.getName()
- );
- }
- }
- }
- else {
- // Previous partition was fully consumed. Commit file to HDFS and create a new AVRO-file.
- syslogAvroWriter.close();
- HDFSWrite writer = new HDFSWrite(config, lastObjectJo);
- writer.commit(syslogFile);
-
- // This part defines a new empty file to which the new AVRO-serialized records are stored until it again hits the 64M size limit.
- writableQueue
- .setQueueNamePrefix(recordOffsetObjectJo.get("topic").getAsString() + recordOffsetObjectJo.get("partition").getAsString());
- syslogFile = writableQueue.getNextWritableFile();
- syslogAvroWriter = new SyslogAvroWriter(syslogFile);
- }
- }
- catch (IOException ioException) {
- throw new UncheckedIOException(ioException);
- }
- }
-
- byte[] byteArray = recordOffsetObject.getRecord(); // loads the byte[] contained in recordOffsetObject.getRecord() to byteArray.
- if (byteArray == null) {
- if (skipEmptyRFC5424Records) {
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Skipping processing an empty non RFC5424 record. Record metadata: {}",
- recordOffsetObject.offsetToJSON()
- );
- }
- continue;
- }
- else {
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("Null record metadata: {}", recordOffsetObject.offsetToJSON());
- }
- syslogFile.delete(); // Clean up
- throw new NullPointerException("Record with null content detected during processing.");
- }
-
- }
- InputStream inputStream = new ByteArrayInputStream(byteArray);
- rfc5424Frame.load(inputStream);
- try {
- if (rfc5424Frame.next()) {
- /*rfc5424Frame has loaded the record data, it's ready for deserialization.
- Implement AVRO serialization for the Kafka records here, preparing the data for writing to HDFS.
- Write all the data into a file using AVRO.
- The size of each AVRO-serialized file should be as close to 64M as possible.*/
-
- batchBytes = batchBytes + byteArray.length;
-
- // input
- final byte[] source = eventToSource();
-
- // origin
- final byte[] origin = eventToOrigin();
-
- // Format: Use AVRO format with syslog columns as indexed ones
- final long epochMicros = rfc3339ToEpoch(
- new RFC5424Timestamp(rfc5424Frame.timestamp).toZonedDateTime()
- );
- SyslogRecord syslogRecord = SyslogRecord
- .newBuilder()
- .setTimestamp(epochMicros)
- .setPayload(rfc5424Frame.msg.toString())
- .setDirectory(rfc5424Frame.structuredData.getValue(teragrepDirectory).toString())
- .setStream(rfc5424Frame.structuredData.getValue(teragrepStreamName).toString()) // Or is sourcetype/stream supposed to be rfc5424Frame.appName.toString() instead?
- .setHost(rfc5424Frame.hostname.toString())
- .setInput(new String(source, StandardCharsets.UTF_8))
- .setPartition(recordOffsetObjectJo.get("partition").getAsString())
- .setOffset(recordOffsetObjectJo.get("offset").getAsLong())
- .setOrigin(new String(origin, StandardCharsets.UTF_8))
- .build();
-
- // Calculate the size of syslogRecord that is going to be written to syslogAvroWriter-file.
- long capacity = syslogRecord.toByteBuffer().capacity();
- // Check if there is still room in syslogAvroWriter for another syslogRecord. Commit syslogAvroWriter to HDFS if no room left, emptying it out in the process.
- boolean fileCommitted = writeToHdfs(syslogAvroWriter.getFileSize() + capacity, lastObjectJo);
- if (fileCommitted) {
- // This part defines a new empty file to which the new AVRO-serialized records are stored until it again hits the size limit defined in config.
- writableQueue
- .setQueueNamePrefix(recordOffsetObjectJo.get("topic").getAsString() + recordOffsetObjectJo.get("partition").getAsString());
- syslogFile = writableQueue.getNextWritableFile();
- syslogAvroWriter = new SyslogAvroWriter(syslogFile);
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Target file size reached, file <{}> stored to <{}/{}.{}> in HDFS",
- syslogFile.getName(), lastObjectJo.get("topic").getAsString(), lastObjectJo.get("partition").getAsString(), lastObjectJo.get("offset").getAsString()
- );
- }
- }
- else {
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Target file size not yet reached, continuing writing records to <{}>.",
- syslogFile.getName()
- );
- }
- }
- // Add syslogRecord to syslogAvroWriter which has room for new syslogRecord.
- syslogAvroWriter.write(syslogRecord);
- lastObject = recordOffsetObject;
- lastObjectJo = JsonParser.parseString(lastObject.offsetToJSON()).getAsJsonObject();
- }
- }
- catch (IOException e) {
- throw new UncheckedIOException(e);
- }
- catch (ParseException e) {
- if (skipNonRFC5424Records) {
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Skipping processing a non RFC5424 record, record metadata: {}. Exception information: ",
- recordOffsetObject.offsetToJSON(), e
- );
- }
- continue;
- }
- else {
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Record metadata that is causing ParseException: {}.",
- recordOffsetObject.offsetToJSON()
- );
- }
- syslogFile.delete(); // Clean up
- throw new RuntimeException(e);
- }
- }
- }
-
- // Handle the "leftover" syslogRecords from the loop.
- try {
- if (syslogAvroWriter != null && !lastObject.isNull()) {
- syslogAvroWriter.close();
- try (HDFSWrite writer = new HDFSWrite(config, lastObjectJo)) {
- writer.commit(syslogFile); // commits the final AVRO-file to HDFS.
- }
- }
- }
- catch (IOException e) {
- throw new UncheckedIOException(e);
- }
-
- // Measures performance of code that is between start and end.
- long end = Instant.now().toEpochMilli();
-
- long took = (end - start);
- topicCounter.setDatabaseLatency(took);
-
- if (took == 0) {
- took = 1;
- }
- long rps = recordOffsetObjectList.size() * 1000L / took;
- topicCounter.setRecordsPerSecond(rps);
-
- long bps = batchBytes * 1000 / took;
- topicCounter.setBytesPerSecond(bps);
-
- durationStatistics.addAndGetRecords(recordOffsetObjectList.size());
- durationStatistics.addAndGetBytes(batchBytes);
-
- topicCounter.addToTotalBytes(batchBytes);
- topicCounter.addToTotalRecords(recordOffsetObjectList.size());
-
- if (LOGGER.isDebugEnabled()) {
- LOGGER
- .debug(
- "Sent batch for <[{}]> with records <{}> and size <{}> KB took <{}> milliseconds. <{}> RPS. <{}> KB/s ",
- table, recordOffsetObjectList.size(), batchBytes / 1024, (took), rps, bps / 1024
- );
- }
- lastTimeCalled = Instant.now().toEpochMilli();
- }
-
- private byte[] eventToOrigin() {
- byte[] origin;
- Fragment originFragment = rfc5424Frame.structuredData.getValue(originHostname);
- if (!originFragment.isStub) {
- origin = originFragment.toBytes();
- }
- else {
- origin = new byte[] {};
- }
- return origin;
- }
-
- private byte[] eventToSource() {
- /*input is produced from SD element event_node_source@48577 by
- concatenating "source_module:hostname:source". in case
- if event_node_source@48577 is not available use event_node_relay@48577.
- If neither are present, use null value.*/
-
- sourceConcatenationBuffer.clear();
-
- Fragment sourceModuleFragment = rfc5424Frame.structuredData.getValue(eventNodeSourceSourceModule);
- if (sourceModuleFragment.isStub) {
- sourceModuleFragment = rfc5424Frame.structuredData.getValue(eventNodeRelaySourceModule);
- }
-
- byte[] source_module;
- if (!sourceModuleFragment.isStub) {
- source_module = sourceModuleFragment.toBytes();
- }
- else {
- source_module = new byte[] {};
- }
-
- Fragment sourceHostnameFragment = rfc5424Frame.structuredData.getValue(eventNodeSourceHostname);
- if (sourceHostnameFragment.isStub) {
- sourceHostnameFragment = rfc5424Frame.structuredData.getValue(eventNodeRelayHostname);
- }
-
- byte[] source_hostname;
- if (!sourceHostnameFragment.isStub) {
- source_hostname = sourceHostnameFragment.toBytes();
- }
- else {
- source_hostname = new byte[] {};
- }
-
- Fragment sourceSourceFragment = rfc5424Frame.structuredData.getValue(eventNodeSourceSource);
- if (sourceHostnameFragment.isStub) {
- sourceSourceFragment = rfc5424Frame.structuredData.getValue(eventNodeRelaySource);
- }
-
- byte[] source_source;
- if (!sourceSourceFragment.isStub) {
- source_source = sourceSourceFragment.toBytes();
- }
- else {
- source_source = new byte[] {};
- }
-
- sourceConcatenationBuffer.put(source_module);
- sourceConcatenationBuffer.put((byte) ':');
- sourceConcatenationBuffer.put(source_hostname);
- sourceConcatenationBuffer.put((byte) ':');
- sourceConcatenationBuffer.put(source_source);
-
- sourceConcatenationBuffer.flip();
- byte[] input = new byte[sourceConcatenationBuffer.remaining()];
- sourceConcatenationBuffer.get(input);
-
- return input;
- }
-}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactory.java
similarity index 83%
rename from src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java
rename to src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactory.java
index 08fa3f22..25562183 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/NullOffset.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactory.java
@@ -45,21 +45,12 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-// Null object design pattern, used to create null offset objects.
-public final class NullOffset implements Offset {
+import org.apache.hadoop.fs.FileSystem;
- @Override
- public boolean isNull() {
- return true;
- }
+import java.io.IOException;
- @Override
- public byte[] getRecord() {
- return new byte[0];
- }
+public interface FileSystemFactory {
+
+ public abstract FileSystem create(boolean initializeUGI) throws IOException;
- @Override
- public String offsetToJSON() {
- return "{\"topic\":\"Not available\", \"partition\":0, \"offset\":0}";
- }
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactoryImpl.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactoryImpl.java
new file mode 100644
index 00000000..22419c87
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/FileSystemFactoryImpl.java
@@ -0,0 +1,123 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import java.io.IOException;
+import java.net.URI;
+
+public final class FileSystemFactoryImpl implements FileSystemFactory {
+
+ private final org.apache.hadoop.hdfs.HdfsConfiguration conf;
+ private final HdfsConfiguration configuration;
+
+ public FileSystemFactoryImpl(HdfsConfiguration configuration) {
+ this.conf = new org.apache.hadoop.hdfs.HdfsConfiguration();
+ this.configuration = configuration;
+ }
+
+ public FileSystem create(boolean initializeUGI) throws IOException {
+ FileSystem fs;
+ if ("kerberos".equals(configuration.hadoopSecurityAuthentication())) {
+ // Initializing the FileSystem with kerberos.
+ String hdfsuri = configuration.hdfsUri(); // Get from config.
+ // set kerberos host and realm
+ System.setProperty("java.security.krb5.realm", configuration.javaSecurityKrb5Realm());
+ System.setProperty("java.security.krb5.kdc", configuration.javaSecurityKrb5Kdc());
+ conf.clear();
+ // enable Kerberos authentication and authorization
+ conf.set("hadoop.security.authentication", configuration.hadoopSecurityAuthentication());
+ conf.set("hadoop.security.authorization", configuration.hadoopSecurityAuthorization());
+ conf
+ .set(
+ "hadoop.kerberos.keytab.login.autorenewal.enabled",
+ configuration.hadoopKerberosKeytabLoginAutorenewalEnabled()
+ );
+ conf.set("fs.defaultFS", hdfsuri); // Set FileSystem URI
+ conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); // Maven stuff?
+ conf.set("fs.file.impl", LocalFileSystem.class.getName()); // Maven stuff?
+ /* hack for running locally with fake DNS records
+ set this to true if overriding the host name in /etc/hosts*/
+ conf.set("dfs.client.use.datanode.hostname", configuration.dfsClientUseDatanodeHostname());
+ /* server principal
+ the Kerberos principal pattern that the namenode is using*/
+ conf.set("dfs.namenode.kerberos.principal.pattern", configuration.dfsNamenodeKerberosPrincipalPattern());
+ // set sasl
+ conf.set("dfs.data.transfer.protection", configuration.dfsDataTransferProtection());
+ conf.set("dfs.encrypt.data.transfer.cipher.suites", configuration.dfsEncryptDataTransferCipherSuites());
+ if (initializeUGI) {
+ UserGroupInformation.setConfiguration(conf);
+ UserGroupInformation
+ .loginUserFromKeytab(configuration.KerberosKeytabUser(), configuration.KerberosKeytabPath());
+ }
+ // filesystem for HDFS access is set here
+ fs = FileSystem.get(conf);
+ }
+ else {
+ // Initializing the FileSystem without Kerberos (e.g. for the minicluster).
+ String hdfsuri = configuration.hdfsUri();
+ // ====== Init HDFS File System Object
+ conf.clear();
+ // Set FileSystem URI
+ conf.set("fs.defaultFS", hdfsuri);
+ // Because of Maven
+ conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+ // Set HADOOP user
+ System.setProperty("HADOOP_USER_NAME", "hdfs");
+ System.setProperty("hadoop.home.dir", "/");
+ //Get the filesystem - HDFS
+ fs = FileSystem.get(URI.create(hdfsuri), conf);
+ }
+ return fs;
+ }
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java
index 7a3fddb6..137a6803 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSPrune.java
@@ -45,7 +45,7 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-import com.teragrep.cfe_39.Config;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -54,16 +54,16 @@
import java.io.IOException;
-public class HDFSPrune {
+public final class HDFSPrune {
private static final Logger LOGGER = LoggerFactory.getLogger(HDFSPrune.class);
private final FileSystem fs;
private final Path newDirectoryPath;
private final long cutOffEpoch;
- public HDFSPrune(Config config, String topicName, FileSystem fs) throws IOException {
+ public HDFSPrune(HdfsConfiguration config, String topicName, FileSystem fs) throws IOException {
this.fs = fs;
- String path = config.getHdfsPath().concat("/").concat(topicName);
+ String path = config.hdfsPath().concat("/").concat(topicName);
//==== Create directory if not exists
Path workingDir = fs.getWorkingDirectory();
newDirectoryPath = new Path(path);
@@ -72,7 +72,7 @@ public HDFSPrune(Config config, String topicName, FileSystem fs) throws IOExcept
fs.mkdirs(newDirectoryPath);
LOGGER.info("Path <{}> created.", path);
}
- long pruneOffset = config.getPruneOffset();
+ long pruneOffset = config.pruneOffset();
cutOffEpoch = System.currentTimeMillis() - pruneOffset; // pruneOffset is parametrized in Config.java. Default value is 2 days in milliseconds.
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java
index bf1f92f6..616ec120 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSRead.java
@@ -45,7 +45,7 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-import com.teragrep.cfe_39.Config;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import org.apache.hadoop.fs.*;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
@@ -64,9 +64,9 @@ The offset map can then be used for kafka consumer seek() method, which will add
private final FileSystem fs;
private final String path;
- public HDFSRead(Config config, FileSystem fs) throws IOException {
+ public HDFSRead(HdfsConfiguration config, FileSystem fs) throws IOException {
this.fs = fs;
- path = config.getHdfsPath();
+ path = config.hdfsPath();
}
public Map hdfsStartOffsets() throws IOException {
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java
index c949ee81..0475ca22 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HDFSWrite.java
@@ -45,142 +45,63 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-import com.google.gson.JsonObject;
-import com.teragrep.cfe_39.Config;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import org.apache.hadoop.fs.*;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
-import java.net.URI;
-import java.util.Properties;
-public class HDFSWrite implements AutoCloseable {
+public final class HDFSWrite implements AutoCloseable {
private static final Logger LOGGER = LoggerFactory.getLogger(HDFSWrite.class);
private final String fileName;
private final String path;
- private final FileSystem fs;
- private final boolean useMockKafkaConsumer; // Defines if mock HDFS database is used for testing
- private final HdfsConfiguration conf;
- private final String hdfsuri;
+ private final HdfsConfiguration configuration;
- public HDFSWrite(Config config, JsonObject lastObjectJo) throws IOException {
-
- Properties readerKafkaProperties = config.getKafkaConsumerProperties();
- this.useMockKafkaConsumer = Boolean
- .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false"));
-
- if (useMockKafkaConsumer) {
- // Code for initializing the class for mock hdfs database usage without kerberos.
- hdfsuri = config.getHdfsuri();
-
- /* The filepath should be something like hdfs:///opt/teragrep/cfe_39/srv/topic_name/0.12345 where 12345 is offset and 0 the partition.
- In other words the directory named topic_name holds files that are named and arranged based on partition and the partition's offset. Every partition has its own set of unique offset values.
- These values should be fetched from config and other input parameters (topic+partition+offset).*/
- path = config.getHdfsPath() + "/" + lastObjectJo.get("topic").getAsString();
- fileName = lastObjectJo.get("partition").getAsString() + "." + lastObjectJo.get("offset").getAsString(); // filename should be constructed from partition and offset.
-
- // ====== Init HDFS File System Object
- conf = new HdfsConfiguration();
- // Set FileSystem URI
- conf.set("fs.defaultFS", hdfsuri);
- // Because of Maven
- conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", LocalFileSystem.class.getName());
- // Set HADOOP user here.
- System.setProperty("HADOOP_USER_NAME", "hdfs");
- System.setProperty("hadoop.home.dir", "/");
- // filesystem for HDFS access is set here
- try {
- fs = FileSystem.get(URI.create(hdfsuri), conf);
- }
- catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- }
- else {
- // Code for initializing the class for kerberized HDFS database usage.
- hdfsuri = config.getHdfsuri();
-
- path = config.getHdfsPath() + "/" + lastObjectJo.get("topic").getAsString();
- fileName = lastObjectJo.get("partition").getAsString() + "." + lastObjectJo.get("offset").getAsString();
-
- // set kerberos host and realm
- System.setProperty("java.security.krb5.realm", config.getKerberosRealm());
- System.setProperty("java.security.krb5.kdc", config.getKerberosHost());
-
- conf = new HdfsConfiguration();
-
- // enable kerberus
- conf.set("hadoop.security.authentication", config.getHadoopAuthentication());
- conf.set("hadoop.security.authorization", config.getHadoopAuthorization());
- conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", config.getKerberosLoginAutorenewal());
-
- conf.set("fs.defaultFS", hdfsuri); // Set FileSystem URI
- conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); // Maven stuff?
- conf.set("fs.file.impl", LocalFileSystem.class.getName()); // Maven stuff?
-
- // hack for running locally with fake DNS records, set this to true if overriding the host name in /etc/hosts
- conf.set("dfs.client.use.datanode.hostname", config.getKerberosTestMode());
-
- // server principal, the kerberos principle that the namenode is using
- conf.set("dfs.namenode.kerberos.principal.pattern", config.getKerberosPrincipal());
-
- // set sasl
- conf.set("dfs.data.transfer.protection", config.getDfsDataTransferProtection());
- conf.set("dfs.encrypt.data.transfer.cipher.suites", config.getDfsEncryptDataTransferCipherSuites());
-
- // filesystem for HDFS access is set here
- fs = FileSystem.get(conf);
- }
+ public HDFSWrite(HdfsConfiguration config, String topic, String partition, long offset) {
+ this.configuration = config;
+ path = config.hdfsPath() + "/" + topic;
+ fileName = partition + "." + offset; // filename should be constructed from partition and offset.
}
// Method for committing the AVRO-file to HDFS
- public void commit(File syslogFile) {
+ public void commit(File syslogFile) throws IOException {
// The code for writing the file to HDFS should be same for both test (non-kerberized access) and prod (kerberized access).
- try {
- //==== Create directory if not exists
- Path workingDir = fs.getWorkingDirectory();
- // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
- Path newDirectoryPath = new Path(path);
- if (!fs.exists(newDirectoryPath)) {
- // Create new Directory
- fs.mkdirs(newDirectoryPath);
- LOGGER.info("Path <{}> created.", path);
- }
-
- //==== Write file
- LOGGER.debug("Begin Write file into hdfs");
- //Create a path
- Path hdfswritepath = new Path(newDirectoryPath.toString() + "/" + fileName); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
- if (fs.exists(hdfswritepath)) {
- LOGGER
- .debug(
- "Deleting the seemingly duplicate source file {} because target file {} already exists in HDFS",
- syslogFile.getPath(), hdfswritepath
- );
- syslogFile.delete();
- throw new RuntimeException("File " + fileName + " already exists");
- }
- else {
- LOGGER.debug("Target file <{}> doesn't exist, proceeding normally.", hdfswritepath);
- }
-
- Path path = new Path(syslogFile.getPath());
- fs.copyFromLocalFile(path, hdfswritepath);
- LOGGER.debug("End Write file into hdfs");
- boolean delete = syslogFile.delete(); // deletes the avro-file from the local disk now that it has been committed to HDFS.
- LOGGER.info("\nFile committed to HDFS, file writepath should be: <{}>\n", hdfswritepath);
+ FileSystemFactoryImpl fileSystemFactoryImpl = new FileSystemFactoryImpl(configuration);
+ FileSystem fs = fileSystemFactoryImpl.create(false);
+ //==== Create directory if not exists
+ Path workingDir = fs.getWorkingDirectory();
+ // Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
+ Path newDirectoryPath = new Path(path);
+ if (!fs.exists(newDirectoryPath)) {
+ // Create new Directory
+ fs.mkdirs(newDirectoryPath);
+ LOGGER.info("Path <{}> created.", path);
+ }
+ //==== Write file
+ LOGGER.debug("Begin Write file into hdfs");
+ //Create a path
+ Path hdfswritepath = new Path(newDirectoryPath.toString() + "/" + fileName); // filename should be set according to the requirements: 0.12345 where 0 is Kafka partition and 12345 is Kafka offset.
+ if (fs.exists(hdfswritepath)) {
+ LOGGER
+ .debug(
+ "Deleting the seemingly duplicate source file {} because target file {} already exists in HDFS",
+ syslogFile.getPath(), hdfswritepath
+ );
+ syslogFile.delete();
+ throw new RuntimeException("File " + fileName + " already exists");
}
- catch (IOException e) {
- throw new RuntimeException(e);
+ else {
+ LOGGER.debug("Target file <{}> doesn't exist, proceeding normally.", hdfswritepath);
}
+
+ Path filePath = new Path(syslogFile.getPath());
+ fs.copyFromLocalFile(filePath, hdfswritepath);
+ LOGGER.debug("End Write file into hdfs");
+ LOGGER.info("\nFile committed to HDFS, file writepath should be: <{}>\n", hdfswritepath);
}
// try-with-resources handles closing the filesystem automatically.
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java
index 305035bd..19c158be 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/HdfsDataIngestion.java
@@ -45,13 +45,12 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
-import com.teragrep.cfe_39.Config;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.metrics.*;
import com.teragrep.cfe_39.metrics.topic.TopicCounter;
import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.PartitionInfo;
@@ -61,86 +60,57 @@
import org.slf4j.LoggerFactory;
import java.io.IOException;
-import java.net.URI;
import java.sql.SQLException;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// Ingests data for HDFS database, periodically scans kafka for new topics based on config.getQueueTopicPattern() and creates kafka topic consumer groups for the new topics that will store the records to HDFS.
-public class HdfsDataIngestion {
+public final class HdfsDataIngestion {
private static final Logger LOGGER = LoggerFactory.getLogger(HdfsDataIngestion.class);
- private final Config config;
+ private final CommonConfiguration config;
+ private final HdfsConfiguration hdfsConfig;
+ private final KafkaConfiguration kafkaConfig;
private final org.apache.kafka.clients.consumer.Consumer kafkaConsumer;
private final List threads = new ArrayList<>();
private final Set activeTopics = new HashSet<>();
- private boolean keepRunning;
- private boolean useMockKafkaConsumer;
+ private final boolean useMockKafkaConsumer;
private final int numOfConsumers;
- private Map hdfsStartOffsets;
- private final FileSystem fs;
+ private final Map hdfsStartOffsets;
- public HdfsDataIngestion(Config config) throws IOException {
- keepRunning = true;
+ public HdfsDataIngestion(
+ CommonConfiguration config,
+ HdfsConfiguration hdfsConfiguration,
+ KafkaConfiguration kafkaConfiguration
+ ) throws IOException {
this.config = config;
- Properties readerKafkaProperties = config.getKafkaConsumerProperties();
- this.numOfConsumers = config.getNumOfConsumers();
- this.useMockKafkaConsumer = Boolean
- .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false"));
+ this.hdfsConfig = hdfsConfiguration;
+ this.kafkaConfig = kafkaConfiguration;
+ this.numOfConsumers = kafkaConfig.numOfConsumers();
+ this.useMockKafkaConsumer = kafkaConfiguration.useMockKafkaConsumer();
if (useMockKafkaConsumer) {
- this.kafkaConsumer = MockKafkaConsumerFactory.getConsumer(0); // A consumer used only for scanning the available topics to be allocated to consumers running in different threads (thus 0 as input parameter).
- // Initializing the FileSystem with minicluster.
- String hdfsuri = config.getHdfsuri();
- // ====== Init HDFS File System Object
- HdfsConfiguration conf = new HdfsConfiguration();
- // Set FileSystem URI
- conf.set("fs.defaultFS", hdfsuri);
- // Because of Maven
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
- // Set HADOOP user
- System.setProperty("HADOOP_USER_NAME", "hdfs");
- System.setProperty("hadoop.home.dir", "/");
- //Get the filesystem - HDFS
- fs = FileSystem.get(URI.create(hdfsuri), conf);
+ this.kafkaConsumer = new MockKafkaConsumerFactory(0).getConsumer(); // A consumer used only for scanning the available topics to be allocated to consumers running in different threads (thus 0 as input parameter).
}
else {
+ Properties kafkaProperties = new Properties();
+ kafkaProperties.put("bootstrap.servers", kafkaConfiguration.bootstrapServers());
+ kafkaProperties.put("auto.offset.reset", kafkaConfiguration.autoOffsetReset());
+ kafkaProperties.put("enable.auto.commit", kafkaConfiguration.enableAutoCommit());
+ kafkaProperties.put("group.id", kafkaConfiguration.groupId());
+ kafkaProperties.put("security.protocol", kafkaConfiguration.securityProtocol());
+ kafkaProperties.put("sasl.mechanism", kafkaConfiguration.saslMechanism());
+ kafkaProperties.put("max.poll.records", kafkaConfiguration.maxPollRecords());
+ kafkaProperties.put("fetch.max.bytes", kafkaConfiguration.fetchMaxBytes());
+ kafkaProperties.put("request.timeout.ms", kafkaConfiguration.requestTimeoutMs());
+ kafkaProperties.put("max.poll.interval.ms", kafkaConfiguration.maxPollIntervalMs());
this.kafkaConsumer = new KafkaConsumer<>(
- config.getKafkaConsumerProperties(),
+ kafkaProperties,
new ByteArrayDeserializer(),
new ByteArrayDeserializer()
);
- // Initializing the FileSystem with kerberos.
- String hdfsuri = config.getHdfsuri(); // Get from config.
- // set kerberos host and realm
- System.setProperty("java.security.krb5.realm", config.getKerberosRealm());
- System.setProperty("java.security.krb5.kdc", config.getKerberosHost());
- HdfsConfiguration conf = new HdfsConfiguration();
- // enable kerberus
- conf.set("hadoop.security.authentication", config.getHadoopAuthentication());
- conf.set("hadoop.security.authorization", config.getHadoopAuthorization());
- conf.set("hadoop.kerberos.keytab.login.autorenewal.enabled", config.getKerberosLoginAutorenewal());
- conf.set("fs.defaultFS", hdfsuri); // Set FileSystem URI
- conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName()); // Maven stuff?
- conf.set("fs.file.impl", LocalFileSystem.class.getName()); // Maven stuff?
- /* hack for running locally with fake DNS records
- set this to true if overriding the host name in /etc/hosts*/
- conf.set("dfs.client.use.datanode.hostname", config.getKerberosTestMode());
- /* server principal
- the kerberos principle that the namenode is using*/
- conf.set("dfs.namenode.kerberos.principal.pattern", config.getKerberosPrincipal());
- // set sasl
- conf.set("dfs.data.transfer.protection", config.getDfsDataTransferProtection());
- conf.set("dfs.encrypt.data.transfer.cipher.suites", config.getDfsEncryptDataTransferCipherSuites());
- // set usergroup stuff
- UserGroupInformation.setConfiguration(conf);
- UserGroupInformation.loginUserFromKeytab(config.getKerberosKeytabUser(), config.getKerberosKeytabPath());
- // filesystem for HDFS access is set here
- fs = FileSystem.get(conf);
}
hdfsStartOffsets = new HashMap<>();
}
@@ -154,17 +124,23 @@ public void run() throws InterruptedException, IOException {
// register per topic counting
List topicCounters = new CopyOnWriteArrayList<>();
+ // Initialize FileSystem
+ FileSystemFactoryImpl fileSystemFactoryImpl = new FileSystemFactoryImpl(hdfsConfig);
+ FileSystem fs = fileSystemFactoryImpl.create(true);
+
// Generates offsets of the already committed records for Kafka and passes them to the kafka consumers.
- try (HDFSRead hr = new HDFSRead(config, fs)) {
- hdfsStartOffsets = hr.hdfsStartOffsets();
+ try (HDFSRead hr = new HDFSRead(hdfsConfig, fs)) {
+ hdfsStartOffsets.clear();
+ hdfsStartOffsets.putAll(hr.hdfsStartOffsets());
LOGGER.debug("topicPartitionStartMap generated succesfully: <{}>", hdfsStartOffsets);
}
catch (IOException e) {
throw new RuntimeException(e);
}
+ boolean keepRunning = true;
while (keepRunning) {
- if ("kerberos".equals(config.getHadoopAuthentication())) {
+ if ("kerberos".equals(hdfsConfig.hadoopSecurityAuthentication())) {
UserGroupInformation.getLoginUser().checkTGTAndReloginFromKeytab();
}
LOGGER.debug("Scanning for threads");
@@ -178,7 +154,7 @@ public void run() throws InterruptedException, IOException {
LOGGER.info("topic that is being pruned: <{}>", topic_name);
if (topic_name != null) {
try {
- HDFSPrune hdfsPrune = new HDFSPrune(config, topic_name, fs);
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, topic_name, fs);
hdfsPrune.prune();
}
catch (IOException e) {
@@ -210,17 +186,19 @@ private void createReader(
/* Every consumer is run in a separate thread.
Consumer group is also handled here, and each consumer of the group runs on separate thread.*/
- int numOfThreads = Math.min(numOfConsumers, listPartitionInfo.size()); // Makes sure that there aren't more consumers than available partitions in the consumer group.
- for (int threadId = 1; numOfThreads >= threadId; threadId++) {
- Consumer> output = new DatabaseOutput(
+ for (int threadId = 1; numOfConsumers >= threadId; threadId++) {
+ BatchDistributionImpl output = new BatchDistributionImpl(
config, // Configuration settings
+ hdfsConfig,
topic, // String, the name of the topic
durationStatistics, // RuntimeStatistics object from metrics
topicCounter // TopicCounter object from metrics
);
ReadCoordinator readCoordinator = new ReadCoordinator(
topic,
- config.getKafkaConsumerProperties(),
+ config,
+ kafkaConfig,
+ hdfsConfig,
output,
hdfsStartOffsets
);
@@ -233,7 +211,7 @@ private void createReader(
private void topicScan(DurationStatistics durationStatistics, List topicCounters) {
Map> listTopics = kafkaConsumer.listTopics(Duration.ofSeconds(60));
- Pattern topicsRegex = Pattern.compile(config.getQueueTopicPattern());
+ Pattern topicsRegex = Pattern.compile(config.queueTopicPattern());
// Find the topics available in Kafka based on given QueueTopicPattern, both active and in-active.
Set foundTopics = new HashSet<>();
Map> foundPartitions = new HashMap<>();
@@ -245,7 +223,7 @@ private void topicScan(DurationStatistics durationStatistics, List
}
}
if (foundTopics.isEmpty()) {
- throw new IllegalStateException("Pattern <[" + config.getQueueTopicPattern() + "]> found no topics.");
+ throw new IllegalStateException("Pattern <[" + config.queueTopicPattern() + "]> found no topics.");
}
// subtract currently active topics from found topics
foundTopics.removeAll(activeTopics);
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaAsSyslogRecord.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaAsSyslogRecord.java
new file mode 100644
index 00000000..4a29c019
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaAsSyslogRecord.java
@@ -0,0 +1,210 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.rlo_06.*;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.time.ZonedDateTime;
+
+/**
+ * Parses one RFC 5424 frame (a Kafka record value) into an Avro {@link SyslogRecord}.
+ * The parser frame is reused between calls, so an instance carries per-call state;
+ * NOTE(review): that suggests instances should not be shared between threads — confirm.
+ */
+public final class KafkaAsSyslogRecord {
+
+    // Lookups for the event source element, each paired with its relay fallback.
+    private final SDVector eventNodeSourceSource;
+    private final SDVector eventNodeRelaySource;
+    private final SDVector eventNodeSourceSourceModule;
+    private final SDVector eventNodeRelaySourceModule;
+    private final SDVector eventNodeSourceHostname;
+    private final SDVector eventNodeRelayHostname;
+
+    private final SDVector teragrepStreamName;
+    private final SDVector teragrepDirectory;
+
+    // Origin
+    private final SDVector originHostname;
+
+    // Reused for every conversion; load() points it at the next input stream.
+    private final RFC5424Frame rfc5424Frame;
+
+    /** Creates the converter with its structured-data lookups and parser frame. */
+    public KafkaAsSyslogRecord() {
+        this.eventNodeSourceSource = new SDVector("event_node_source@48577", "source");
+        this.eventNodeRelaySource = new SDVector("event_node_relay@48577", "source");
+        this.eventNodeSourceSourceModule = new SDVector("event_node_source@48577", "source_module");
+        this.eventNodeRelaySourceModule = new SDVector("event_node_relay@48577", "source_module");
+        this.eventNodeSourceHostname = new SDVector("event_node_source@48577", "hostname");
+        this.eventNodeRelayHostname = new SDVector("event_node_relay@48577", "hostname");
+
+        this.teragrepStreamName = new SDVector("teragrep@48577", "streamname");
+        this.teragrepDirectory = new SDVector("teragrep@48577", "directory");
+
+        // Origin
+        this.originHostname = new SDVector("origin@48577", "hostname");
+
+        this.rfc5424Frame = new RFC5424Frame();
+    }
+
+    /**
+     * Converts a timestamp to microseconds since the Unix epoch.
+     *
+     * @param zonedDateTime timestamp to convert
+     * @return epoch microseconds; precision below one microsecond is truncated
+     * @throws ArithmeticException if the value does not fit in a long
+     */
+    private long rfc3339ToEpoch(ZonedDateTime zonedDateTime) {
+        final Instant instant = zonedDateTime.toInstant();
+
+        final long MICROS_PER_SECOND = 1000L * 1000L;
+        final long NANOS_PER_MICROS = 1000L;
+        final long sec = Math.multiplyExact(instant.getEpochSecond(), MICROS_PER_SECOND);
+
+        return Math.addExact(sec, instant.getNano() / NANOS_PER_MICROS);
+    }
+
+    /**
+     * Parses the next RFC 5424 frame from the stream and maps it to a {@link SyslogRecord}.
+     *
+     * @param inputStream stream containing the syslog frame
+     * @param partition   Kafka partition identifier stored in the record
+     * @param offset      Kafka offset stored in the record
+     * @return record built from the frame's timestamp, message, hostname and structured data
+     * @throws UncheckedIOException if advancing the frame fails with an IOException
+     */
+    public SyslogRecord toSyslogRecord(InputStream inputStream, String partition, long offset) {
+        rfc5424Frame.load(inputStream);
+        try {
+            rfc5424Frame.next();
+        }
+        catch (IOException ioException) {
+            throw new UncheckedIOException(ioException);
+        }
+
+        final long epochMicros = rfc3339ToEpoch(new RFC5424Timestamp(rfc5424Frame.timestamp).toZonedDateTime());
+
+        // input
+        final byte[] source = eventToSource();
+
+        // origin
+        final byte[] origin = eventToOrigin();
+
+        return SyslogRecord
+                .newBuilder()
+                .setTimestamp(epochMicros)
+                .setPayload(rfc5424Frame.msg.toString())
+                .setDirectory(rfc5424Frame.structuredData.getValue(teragrepDirectory).toString())
+                .setStream(rfc5424Frame.structuredData.getValue(teragrepStreamName).toString())
+                .setHost(rfc5424Frame.hostname.toString())
+                .setInput(new String(source, StandardCharsets.UTF_8))
+                .setPartition(String.valueOf(partition))
+                .setOffset(offset)
+                .setOrigin(new String(origin, StandardCharsets.UTF_8))
+                .build();
+    }
+
+    /** Returns the origin@48577 hostname bytes, or an empty array when it is absent. */
+    private byte[] eventToOrigin() {
+        return bytesOrEmpty(rfc5424Frame.structuredData.getValue(originHostname));
+    }
+
+    /**
+     * Builds the input field as "source_module:hostname:source". Each part is read from
+     * event_node_source@48577 and, when missing there, from event_node_relay@48577.
+     * A part missing from both is left empty; the two ':' separators are always written.
+     *
+     * @return the concatenated bytes
+     */
+    private byte[] eventToSource() {
+        final byte[] sourceModule = bytesOrEmpty(
+                valueOrFallback(eventNodeSourceSourceModule, eventNodeRelaySourceModule)
+        );
+        final byte[] sourceHostname = bytesOrEmpty(
+                valueOrFallback(eventNodeSourceHostname, eventNodeRelayHostname)
+        );
+        // Whether "source" falls back to the relay element depends on the source fragment
+        // itself, independently of whether a hostname was found.
+        final byte[] sourceSource = bytesOrEmpty(
+                valueOrFallback(eventNodeSourceSource, eventNodeRelaySource)
+        );
+
+        // Sized exactly for three parts plus two separators, so no fixed capacity can overflow.
+        final int length = sourceModule.length + sourceHostname.length + sourceSource.length + 2;
+        final ByteBuffer input = ByteBuffer.allocate(length);
+        input.put(sourceModule);
+        input.put((byte) ':');
+        input.put(sourceHostname);
+        input.put((byte) ':');
+        input.put(sourceSource);
+        return input.array();
+    }
+
+    /** Looks up {@code primary}; when that fragment is a stub, looks up {@code fallback}. */
+    private Fragment valueOrFallback(SDVector primary, SDVector fallback) {
+        final Fragment fragment = rfc5424Frame.structuredData.getValue(primary);
+        if (fragment.isStub) {
+            return rfc5424Frame.structuredData.getValue(fallback);
+        }
+        return fragment;
+    }
+
+    /** Returns the fragment's bytes, or an empty array when the fragment is a stub. */
+    private static byte[] bytesOrEmpty(Fragment fragment) {
+        if (fragment.isStub) {
+            return new byte[] {};
+        }
+        return fragment.toBytes();
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java
index a60d9d25..c49453d2 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaReader.java
@@ -45,31 +45,41 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
import org.apache.kafka.clients.consumer.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
+import java.time.Instant;
import java.util.*;
-public class KafkaReader implements AutoCloseable {
+public final class KafkaReader implements AutoCloseable {
- final Logger LOGGER = LoggerFactory.getLogger(KafkaReader.class);
- private Iterator> kafkaRecordsIterator = Collections.emptyIterator();
+ private final Logger LOGGER = LoggerFactory.getLogger(KafkaReader.class);
+
+ private final CommonConfiguration config;
private final Consumer kafkaConsumer;
- private final java.util.function.Consumer> callbackFunction;
+ private final BatchDistributionImpl callbackFunction;
+ private final ConsumerRebalanceListenerImpl consumerRebalanceListenerImpl;
+ private long lastTimeCalled;
public KafkaReader(
Consumer kafkaConsumer,
- java.util.function.Consumer> callbackFunction
+ BatchDistributionImpl callbackFunction,
+ ConsumerRebalanceListenerImpl consumerRebalanceListenerImpl,
+ CommonConfiguration config
) {
this.kafkaConsumer = kafkaConsumer;
this.callbackFunction = callbackFunction;
+ this.consumerRebalanceListenerImpl = consumerRebalanceListenerImpl;
+ this.config = config;
+ this.lastTimeCalled = Instant.now().toEpochMilli();
}
public void read() {
- long offset;
+ Iterator> kafkaRecordsIterator = Collections.emptyIterator();
if (!kafkaRecordsIterator.hasNext()) {
// still need to consume more, infinitely loop because connection problems may cause return of an empty iterator
ConsumerRecords kafkaRecords = kafkaConsumer.poll(Duration.ofSeconds(60));
@@ -79,21 +89,31 @@ public void read() {
kafkaRecordsIterator = kafkaRecords.iterator();
}
- List recordOffsetObjectList = new ArrayList<>();
+ List recordOffsetObjectList = new ArrayList<>();
while (kafkaRecordsIterator.hasNext()) {
ConsumerRecord record = kafkaRecordsIterator.next();
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("adding from offset: <{}>", record.offset());
}
recordOffsetObjectList
- .add(new RecordOffset(record.topic(), record.partition(), record.offset(), record.value()));
+ .add(new KafkaRecordImpl(record.topic(), record.partition(), record.offset(), record.value()));
}
if (!recordOffsetObjectList.isEmpty()) {
- /* This is the DatabaseOutput.accept() function.
- Offset and other required data for HDFS storage are added to the input parameters of the accept() function which processes the consumed record.*/
+ /* This is the BatchDistributionImpl.accept() function.
+ KafkaRecord and other required data for HDFS storage are added to the input parameters of the accept() function which processes the consumed record.*/
callbackFunction.accept(recordOffsetObjectList);
- kafkaConsumer.commitSync();
+ kafkaConsumer.commitAsync();
+ lastTimeCalled = Instant.now().toEpochMilli();
+ }
+ else {
+ // If no new kafka record batches is received for a while, use callbackFunction.accept() with empty recordOffsetObjectList to flush records that have already been committed in kafka to HDFS.
+ long thisTime = Instant.now().toEpochMilli();
+ long ftook = thisTime - lastTimeCalled;
+ if (ftook > config.consumerTimeout()) {
+ callbackFunction.accept(recordOffsetObjectList);
+ lastTimeCalled = Instant.now().toEpochMilli();
+ }
}
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecord.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecord.java
new file mode 100644
index 00000000..31565d24
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecord.java
@@ -0,0 +1,60 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import org.apache.kafka.common.TopicPartition;
+
+public interface KafkaRecord {
+    /** @return size of the record payload (KafkaRecordImpl reports its byte length, 0 when null) */
+    long size();
+    /** @return the Kafka topic and partition the record belongs to */
+    TopicPartition topicPartition();
+    /** @return the Kafka offset of the record */
+    long offset();
+    /** @return the record converted to an Avro SyslogRecord */
+    SyslogRecord toSyslogRecord();
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/RecordOffset.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecordImpl.java
similarity index 74%
rename from src/main/java/com/teragrep/cfe_39/consumers/kafka/RecordOffset.java
rename to src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecordImpl.java
index 543a58fd..d313287f 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/RecordOffset.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/KafkaRecordImpl.java
@@ -45,15 +45,21 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import org.apache.kafka.common.TopicPartition;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
// This is the class for handling the Kafka record topic/partition/offset data that are required for HDFS storage.
-public final class RecordOffset implements Offset {
+public final class KafkaRecordImpl implements KafkaRecord {
private final String topic;
private final int partition;
private final long offset;
private final byte[] record;
- public RecordOffset(String topic, int partition, long offset, byte[] record) {
+ public KafkaRecordImpl(String topic, int partition, long offset, byte[] record) {
this.topic = topic;
this.partition = partition;
this.offset = offset;
@@ -61,18 +67,29 @@ public RecordOffset(String topic, int partition, long offset, byte[] record) {
}
@Override
- public boolean isNull() {
- return false;
+ public long size() {
+ if (record == null) {
+ return 0;
+ }
+ else {
+ return record.length;
+ }
+ }
+
+ @Override
+ public TopicPartition topicPartition() {
+ return new TopicPartition(topic, partition);
}
@Override
- public byte[] getRecord() {
- return record;
+ public long offset() {
+ return this.offset;
}
@Override
- public String offsetToJSON() {
- return String
- .format("{\"topic\":\"%s\", \"partition\":%d, \"offset\":%d}", this.topic, this.partition, this.offset);
+ public SyslogRecord toSyslogRecord() {
+ InputStream inputStream = new ByteArrayInputStream(record);
+ return new KafkaAsSyslogRecord().toSyslogRecord(inputStream, String.valueOf(partition), offset);
}
+
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java
index e5da3a81..da94c876 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/MockKafkaConsumerFactory.java
@@ -65,14 +65,16 @@
* @author Mikko Kortelainen
*/
@VisibleForTesting
-public class MockKafkaConsumerFactory {
+public final class MockKafkaConsumerFactory {
- final static private Logger LOGGER = LoggerFactory.getLogger(MockKafkaConsumerFactory.class);
+ private final Logger LOGGER = LoggerFactory.getLogger(MockKafkaConsumerFactory.class);
+ private final int threadNum;
- private MockKafkaConsumerFactory() {
+ public MockKafkaConsumerFactory(int threadNumInput) {
+ this.threadNum = threadNumInput;
}
- private static void generateEvents(MockConsumer consumer, String topicName, int partition) {
+ private void generateEvents(MockConsumer consumer, String topicName, int partition) {
consumer
.addRecord(
new ConsumerRecord<>(
@@ -252,58 +254,47 @@ private static void generateEvents(MockConsumer consumer, String
}
// Can initialize topic scan with all partitions available when the input parameter is 0. Consumer is manually assigned to specific partitions depending on the threadnum parameter. For example on threadnum 1 consumer has odd numbered partitions assigned to it and threadnum 2 has the even numbered partitions.
- public static Consumer getConsumer(int threadnum) {
+ public Consumer getConsumer() {
LOGGER.warn("useMockKafkaConsumer is set, using MockKafkaConsumer");
int amountofloops = 10; // number of loops for adding partitions/records to the mock consumer topic. Each loop adds a new partition of 14 records. 17777 loops results in file size slightly above 64M. 10 loops is sized at 36,102 bits.
final MockConsumer consumer;
consumer = new MockConsumer<>(OffsetResetStrategy.EARLIEST);
List topicPartitions = new ArrayList<>();
- LinkedHashMap beginningOffsets = new LinkedHashMap<>();
- LinkedHashMap endOffsets = new LinkedHashMap<>();
+ Map beginningOffsets = new HashMap<>();
+ Map endOffsets = new HashMap<>();
List mockPartitionInfo = new ArrayList<>();
// generate the topic partitions and metadata first
for (int i = 0; i < amountofloops; i++) {
TopicPartition topicPartition = new TopicPartition("testConsumerTopic", i);
- topicPartitions.add(topicPartition);
- beginningOffsets.put(topicPartition, 0L);
- endOffsets.put(topicPartition, 14L);
- mockPartitionInfo.add(new PartitionInfo("testConsumerTopic", i, null, null, null));
- }
-
- if (threadnum == 1) {
- List oddTopicPartitions = new ArrayList<>();
- for (TopicPartition a : topicPartitions) {
- if (((a.partition() + 1) % 2) == 0) {
- oddTopicPartitions.add(a);
- }
- }
- consumer.assign(oddTopicPartitions); // assign
- for (TopicPartition a : topicPartitions) {
- if (((a.partition() + 1) % 2) == 0) {
- generateEvents(consumer, a.topic(), a.partition());
+ if (threadNum == 1) {
+ if (((i + 1) % 2) == 0) {
+ topicPartitions.add(topicPartition);
+ beginningOffsets.put(topicPartition, 0L);
+ endOffsets.put(topicPartition, 14L);
+ mockPartitionInfo.add(new PartitionInfo("testConsumerTopic", i, null, null, null));
}
}
- }
- else if (threadnum == 2) {
- List evenTopicPartitions = new ArrayList<>();
- for (TopicPartition a : topicPartitions) {
- if (((a.partition() + 1) % 2) != 0) {
- evenTopicPartitions.add(a);
+ else if (threadNum == 2) {
+ if (((i + 1) % 2) != 0) {
+ topicPartitions.add(topicPartition);
+ beginningOffsets.put(topicPartition, 0L);
+ endOffsets.put(topicPartition, 14L);
+ mockPartitionInfo.add(new PartitionInfo("testConsumerTopic", i, null, null, null));
}
}
- consumer.assign(evenTopicPartitions); // assign
- for (TopicPartition a : topicPartitions) {
- if (((a.partition() + 1) % 2) != 0) {
- generateEvents(consumer, a.topic(), a.partition());
- }
+ else {
+ topicPartitions.add(topicPartition);
+ beginningOffsets.put(topicPartition, 0L);
+ endOffsets.put(topicPartition, 14L);
+ mockPartitionInfo.add(new PartitionInfo("testConsumerTopic", i, null, null, null));
}
}
- else {
- consumer.assign(topicPartitions); // assign
- for (TopicPartition a : topicPartitions) {
- generateEvents(consumer, a.topic(), a.partition());
- }
+
+ consumer.subscribe(Collections.singletonList("testConsumerTopic"));
+ consumer.rebalance(topicPartitions);
+ for (TopicPartition a : topicPartitions) {
+ generateEvents(consumer, a.topic(), a.partition());
}
consumer.updateBeginningOffsets(beginningOffsets);
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFile.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFile.java
new file mode 100644
index 00000000..53988665
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFile.java
@@ -0,0 +1,60 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import java.io.IOException;
+
+public interface PartitionFile {
+    // NOTE(review): contracts below are inferred from the method names only — confirm against the implementation.
+    /** Adds one Kafka record to this partition file. */
+    void addRecord(KafkaRecordImpl kafkaRecord);
+    /** Commits the records added so far; may fail with an IOException. */
+    void commitRecords() throws IOException;
+    /** Presumably writes the file to HDFS ahead of the normal schedule — TODO confirm. */
+    void writeToHdfsEarly() throws IOException;
+    /** Presumably removes the file — TODO confirm what exactly is deleted. */
+    void delete();
+}
diff --git a/src/test/java/com/teragrep/cfe_39/ConfigTest.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileFactory.java
similarity index 53%
rename from src/test/java/com/teragrep/cfe_39/ConfigTest.java
rename to src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileFactory.java
index 7fc13bf0..4a9ffe92 100644
--- a/src/test/java/com/teragrep/cfe_39/ConfigTest.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileFactory.java
@@ -43,46 +43,35 @@
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
-package com.teragrep.cfe_39;
+package com.teragrep.cfe_39.consumers.kafka;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.queue.UniqueFileCreated;
+import org.apache.kafka.common.TopicPartition;
-import java.util.Properties;
+import java.io.IOException;
-import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+public final class PartitionFileFactory {
-public class ConfigTest {
+ private final CommonConfiguration config;
+ private final HdfsConfiguration hdfsConfig;
- private static final Logger LOGGER = LoggerFactory.getLogger(ConfigTest.class);
-
- @Test
- public void validConfigTest() {
- assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- Config config = new Config();
- Properties readerKafkaProperties = config.getKafkaConsumerProperties();
- // Test extracting useMockKafkaConsumer value from config.
- boolean useMockKafkaConsumer = Boolean
- .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false"));
- Assertions.assertTrue(useMockKafkaConsumer);
- LOGGER.debug("useMockKafkaConsumer: {}", useMockKafkaConsumer);
- });
+ PartitionFileFactory(CommonConfiguration config, HdfsConfiguration hdfsConfig) { // Keeps both configurations for use by partitionFor().
+ this.config = config;
+ this.hdfsConfig = hdfsConfig;
 }
- @Test
- public void brokenConfigTest() {
- // Set system properties to use the broken configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/broken.application.properties");
- // Test if the broken configuration throws the expected exception.
- Exception e = Assertions.assertThrows(Exception.class, () -> {
- Config config = new Config();
- });
- Assertions.assertEquals("hdfsuri not set", e.getMessage());
+ public PartitionFileImpl partitionFor(TopicPartition topicPartition) throws IOException { // Builds the PartitionFileImpl of the given topic partition.
+ UniqueFileCreated uniqueFileCreated = new UniqueFileCreated(
+ config.queueDirectory(),
+ topicPartition.topic() + topicPartition.partition() // File name prefix: topic name directly followed by the partition number. NOTE(review): without a separator e.g. ("a1", 1) and ("a", 11) give the same prefix - confirm topic naming rules this out.
+ );
+ return new PartitionFileImpl(
+ uniqueFileCreated.getNextWritableFile(), // Local file handed to the new PartitionFileImpl.
+ hdfsConfig,
+ topicPartition,
+ new PartitionRecordsImpl(config)
+ );
 }
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileImpl.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileImpl.java
new file mode 100644
index 00000000..f6f13e32
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionFileImpl.java
@@ -0,0 +1,179 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import org.apache.kafka.common.TopicPartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Local Avro file of one Kafka topic partition. Records are buffered by {@link #addRecord}, appended to the file
+ * by {@link #commitRecords()}, and the file is stored to HDFS when it exceeds the configured maximum size.
+ */
+public final class PartitionFileImpl implements PartitionFile {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PartitionFileImpl.class);
+
+    private final TopicPartition topicPartition;
+    private final HdfsConfiguration hdfsConfig;
+    private final File syslogFile;
+    /** Final offsets of the batches appended to syslogFile since it was last stored to HDFS. */
+    private final List<Long> batchOffsets;
+    private final PartitionRecordsImpl partitionRecords;
+
+    /** Creates a partition file that has no pending batch offsets. */
+    PartitionFileImpl(
+            File file,
+            HdfsConfiguration hdfsConfig,
+            TopicPartition topicPartition,
+            PartitionRecordsImpl partitionRecords
+    ) {
+        this(file, hdfsConfig, topicPartition, new ArrayList<>(), partitionRecords);
+    }
+
+    /** Creates a partition file that starts with the given pending batch offsets. */
+    PartitionFileImpl(
+            File syslogFile,
+            HdfsConfiguration hdfsConfig,
+            TopicPartition topicPartition,
+            List<Long> batchOffsets,
+            PartitionRecordsImpl partitionRecords
+    ) {
+        this.syslogFile = syslogFile;
+        this.hdfsConfig = hdfsConfig;
+        this.topicPartition = topicPartition;
+        this.batchOffsets = batchOffsets;
+        this.partitionRecords = partitionRecords;
+    }
+
+    /** Buffers the record; it is written to the local file by the next {@link #commitRecords()}. */
+    @Override
+    public void addRecord(KafkaRecordImpl kafkaRecord) {
+        partitionRecords.addRecord(kafkaRecord);
+    }
+
+    /**
+     * Appends the buffered records to the local file and stores the file to HDFS each time it exceeds
+     * {@code hdfsConfig.maximumFileSize()}. An empty batch stores the existing local file to HDFS instead.
+     *
+     * @throws IOException if appending to the local file or storing it to HDFS fails
+     */
+    @Override
+    public void commitRecords() throws IOException {
+        List<SyslogRecord> syslogRecordList = partitionRecords.toSyslogRecordList();
+        long storedOffset = 0;
+        for (SyslogRecord next : syslogRecordList) {
+            // The writer is opened and closed per record, so the size check below sees a closed file.
+            try (SyslogAvroWriter syslogAvroWriter = new SyslogAvroWriter(syslogFile)) {
+                syslogAvroWriter.write(next);
+            }
+            storedOffset = Math.max(storedOffset, next.getOffset());
+            // File size is above the maximum: store it to HDFS under the latest offset and delete the local file.
+            if (hdfsConfig.maximumFileSize() < syslogFile.length()) {
+                writeToHdfs(storedOffset);
+            }
+        }
+        if (syslogRecordList.isEmpty()) {
+            // No records mean consumer group rebalance happened, write file to HDFS.
+            LOGGER.debug(
+                    "Kafka Batch for topic {} partition {} was empty. Final record offset of the batch was {}. Proceeding to write the existing syslogFile to HDFS.",
+                    topicPartition.topic(), topicPartition.partition(), storedOffset
+            );
+            writeToHdfsEarly();
+        }
+        else {
+            // Store the last offset of the batch. Any non-empty batch counts, because 0 is a valid Kafka offset.
+            // NOTE(review): the offset is also added when the last record just triggered writeToHdfs (local file deleted) - confirm intended.
+            LOGGER.debug(
+                    "Kafka Batch for topic {} partition {} processed successfully. Final record offset of the batch was {}.",
+                    topicPartition.topic(), topicPartition.partition(), storedOffset
+            );
+            batchOffsets.add(storedOffset);
+        }
+    }
+
+    /** Stores the local file to HDFS under the final offset of the latest pending batch, if there is one. */
+    @Override
+    public void writeToHdfsEarly() throws IOException {
+        if (!batchOffsets.isEmpty()) {
+            writeToHdfs(batchOffsets.get(batchOffsets.size() - 1));
+        }
+    }
+
+    /** Deletes the local file without storing it to HDFS. */
+    @Override
+    public void delete() {
+        LOGGER.debug(
+                "PartitionFileImpl-object representing topic {} partition {} was notified of consumer group rebalance. Deleting syslogFile allocated to the object at {}",
+                topicPartition.topic(), topicPartition.partition(), syslogFile.getPath()
+        );
+        syslogFile.delete();
+    }
+
+    /** Commits the local file to HDFS under the given offset, deletes it and clears the pending batch offsets. */
+    private void writeToHdfs(long offset) throws IOException {
+        String partition = Integer.toString(topicPartition.partition());
+        try (HDFSWrite writer = new HDFSWrite(hdfsConfig, topicPartition.topic(), partition, offset)) {
+            writer.commit(syslogFile); // commits the final AVRO-file to HDFS.
+        }
+        if (!syslogFile.delete()) { // Delete the file as all the contents have been stored to HDFS.
+            LOGGER.warn("Failed to delete syslogFile {} after storing it to HDFS.", syslogFile.getPath());
+        }
+        batchOffsets.clear();
+        LOGGER.debug(
+                "SyslogFile representing topic {} partition {} stored to HDFS with offset value of {}. SyslogFile allocated to the object located at {} has been deleted to prepare for storing new records.",
+                topicPartition.topic(), topicPartition.partition(), offset, syslogFile.getPath()
+        );
+    }
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecords.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecords.java
new file mode 100644
index 00000000..b42954ce
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecords.java
@@ -0,0 +1,58 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+
+import java.util.List;
+
+public interface PartitionRecords {
+    /** Adds a Kafka record to be converted by {@link #toSyslogRecordList()}. */
+    public abstract void addRecord(KafkaRecordImpl kafkaRecord);
+    /** Returns the added records converted to {@link SyslogRecord}s. */
+    public abstract List<SyslogRecord> toSyslogRecordList();
+
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecordsImpl.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecordsImpl.java
new file mode 100644
index 00000000..21907e26
--- /dev/null
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/PartitionRecordsImpl.java
@@ -0,0 +1,119 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39.consumers.kafka;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.rlo_06.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** Buffer of Kafka records that converts its contents into {@link SyslogRecord}s on request. */
+public final class PartitionRecordsImpl implements PartitionRecords {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(PartitionRecordsImpl.class);
+
+    private final List<KafkaRecordImpl> kafkaRecordList;
+    private final CommonConfiguration config;
+
+    public PartitionRecordsImpl(CommonConfiguration config) {
+        this.kafkaRecordList = new ArrayList<>();
+        this.config = config;
+    }
+
+    @Override
+    public void addRecord(KafkaRecordImpl kafkaRecord) {
+        this.kafkaRecordList.add(kafkaRecord);
+    }
+
+    /**
+     * Converts the buffered records in insertion order and empties the buffer. A record failing with
+     * {@link ParseException} or {@link NullPointerException} is skipped with a warning when the matching
+     * {@code skipNonRFC5424Records()} / {@code skipEmptyRFC5424Records()} option of the configuration is enabled.
+     * @return the successfully converted records
+     * @throws RuntimeException wrapping the failure when skipping is disabled; the buffer is then not emptied
+     */
+    @Override
+    public List<SyslogRecord> toSyslogRecordList() {
+        List<SyslogRecord> syslogRecordList = new ArrayList<>();
+        for (KafkaRecordImpl next : kafkaRecordList) {
+            try {
+                syslogRecordList.add(next.toSyslogRecord());
+            }
+            catch (ParseException e) {
+                if (config.skipNonRFC5424Records()) {
+                    LOGGER.warn(
+                            "Skipping parsing a non RFC5424 record, record topic partition: <{}> offset:<{}>. Exception information: ",
+                            next.topicPartition(), next.offset(), e
+                    );
+                }
+                else {
+                    LOGGER.error("Failed to parse RFC5424 record <{}> offset:<{}>", next.topicPartition(), next.offset());
+                    throw new RuntimeException(e);
+                }
+            }
+            catch (NullPointerException e) { // Treated as a record with null content, per the messages below.
+                if (config.skipEmptyRFC5424Records()) {
+                    LOGGER.warn(
+                            "Skipping parsing an empty RFC5424 record, record topic partition: <{}> offset:<{}>. Exception information: ",
+                            next.topicPartition(), next.offset(), e
+                    );
+                }
+                else {
+                    LOGGER.error(
+                            "Failed to parse RFC5424 record <{}> offset:<{}> because of null content", next.topicPartition(), next.offset()
+                    );
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+        kafkaRecordList.clear();
+        return syslogRecordList;
+    }
+}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/ReadCoordinator.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/ReadCoordinator.java
index 232c83bd..b6ca987d 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/ReadCoordinator.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/ReadCoordinator.java
@@ -45,6 +45,9 @@
*/
package com.teragrep.cfe_39.consumers.kafka;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
@@ -52,48 +55,71 @@
import org.slf4j.LoggerFactory;
import java.util.*;
-import java.util.function.Consumer;
-public class ReadCoordinator implements Runnable {
+public final class ReadCoordinator implements Runnable {
private static final Logger LOGGER = LoggerFactory.getLogger(ReadCoordinator.class);
private final String queueTopic;
- private final Properties readerKafkaProperties;
- private final Consumer> callbackFunction;
- private boolean run = true;
+ private final CommonConfiguration config;
+ private final HdfsConfiguration hdfsConfig;
+ private final KafkaConfiguration kafkaConfig;
+ private final BatchDistributionImpl callbackFunction;
private final Map hdfsStartOffsets;
public ReadCoordinator(
String queueTopic,
- Properties readerKafkaProperties,
- Consumer> callbackFunction,
+ CommonConfiguration config,
+ KafkaConfiguration kafkaConfig,
+ HdfsConfiguration hdfsConfig,
+ BatchDistributionImpl callbackFunction,
Map hdfsStartOffsets
) {
this.queueTopic = queueTopic;
- this.readerKafkaProperties = readerKafkaProperties;
+ this.config = config;
this.callbackFunction = callbackFunction;
this.hdfsStartOffsets = hdfsStartOffsets;
+ this.kafkaConfig = kafkaConfig;
+ this.hdfsConfig = hdfsConfig;
}
private KafkaReader createKafkaReader(
Properties readerKafkaProperties,
String topic,
- Consumer> callbackFunction,
+ BatchDistributionImpl callbackFunctionInput,
boolean useMockKafkaConsumer
) {
org.apache.kafka.clients.consumer.Consumer kafkaConsumer;
+ ConsumerRebalanceListenerImpl consumerRebalanceListenerImpl;
if (useMockKafkaConsumer) { // Mock kafka consumer is enabled, create mock consumers with assigned partitions that are not overlapping with each other.
String name = Thread.currentThread().getName(); // Use thread name to identify which thread is running the code.
if (Objects.equals(name, "testConsumerTopic1")) {
- kafkaConsumer = MockKafkaConsumerFactory.getConsumer(1); // creates a Kafka MockConsumer that has the odd numbered partitions assigned to it.
+ kafkaConsumer = new MockKafkaConsumerFactory(1).getConsumer(); // creates a Kafka MockConsumer that has the odd numbered partitions assigned to it.
+ consumerRebalanceListenerImpl = new ConsumerRebalanceListenerImpl(
+ kafkaConsumer,
+ callbackFunctionInput,
+ hdfsConfig
+ );
+ kafkaConsumer.subscribe(Collections.singletonList(topic), consumerRebalanceListenerImpl);
}
else if (Objects.equals(name, "testConsumerTopic2")) {
- kafkaConsumer = MockKafkaConsumerFactory.getConsumer(2); // creates a Kafka MockConsumer that has the even numbered partitions assigned to it.
+ kafkaConsumer = new MockKafkaConsumerFactory(2).getConsumer(); // creates a Kafka MockConsumer that has the even numbered partitions assigned to it.
+ consumerRebalanceListenerImpl = new ConsumerRebalanceListenerImpl(
+ kafkaConsumer,
+ callbackFunctionInput,
+ hdfsConfig
+ );
+ kafkaConsumer.subscribe(Collections.singletonList(topic), consumerRebalanceListenerImpl);
}
else {
- kafkaConsumer = MockKafkaConsumerFactory.getConsumer(0); // Creates a single Kafka MockConsumer that has all the partitions assigned to it.
+ kafkaConsumer = new MockKafkaConsumerFactory(0).getConsumer(); // Creates a single Kafka MockConsumer that has all the partitions assigned to it.
+ consumerRebalanceListenerImpl = new ConsumerRebalanceListenerImpl(
+ kafkaConsumer,
+ callbackFunctionInput,
+ hdfsConfig
+ );
+ kafkaConsumer.subscribe(Collections.singletonList(topic), consumerRebalanceListenerImpl);
}
}
else { // Mock kafka consumer is disabled, subscribe method should handle assigning the partitions automatically to the consumer based on group id parameters of readerKafkaProperties.
@@ -102,7 +128,12 @@ else if (Objects.equals(name, "testConsumerTopic2")) {
new ByteArrayDeserializer(),
new ByteArrayDeserializer()
);
- kafkaConsumer.subscribe(Collections.singletonList(topic));
+ consumerRebalanceListenerImpl = new ConsumerRebalanceListenerImpl(
+ kafkaConsumer,
+ callbackFunctionInput,
+ hdfsConfig
+ );
+ kafkaConsumer.subscribe(Collections.singletonList(topic), consumerRebalanceListenerImpl);
}
Set assignment = kafkaConsumer.assignment();
@@ -116,20 +147,30 @@ else if (Objects.equals(name, "testConsumerTopic2")) {
}
}
- return new KafkaReader(kafkaConsumer, callbackFunction);
+ return new KafkaReader(kafkaConsumer, callbackFunctionInput, consumerRebalanceListenerImpl, config);
}
// Part or Runnable implementation, called when the thread is started.
@Override
public void run() {
- boolean useMockKafkaConsumer = Boolean
- .parseBoolean(readerKafkaProperties.getProperty("useMockKafkaConsumer", "false"));
+ boolean useMockKafkaConsumer = kafkaConfig.useMockKafkaConsumer();
+ Properties kafkaProperties = new Properties();
+ kafkaProperties.put("bootstrap.servers", kafkaConfig.bootstrapServers());
+ kafkaProperties.put("auto.offset.reset", kafkaConfig.autoOffsetReset());
+ kafkaProperties.put("enable.auto.commit", kafkaConfig.enableAutoCommit());
+ kafkaProperties.put("group.id", kafkaConfig.groupId());
+ kafkaProperties.put("security.protocol", kafkaConfig.securityProtocol());
+ kafkaProperties.put("sasl.mechanism", kafkaConfig.saslMechanism());
+ kafkaProperties.put("max.poll.records", kafkaConfig.maxPollRecords());
+ kafkaProperties.put("fetch.max.bytes", kafkaConfig.fetchMaxBytes());
+ kafkaProperties.put("request.timeout.ms", kafkaConfig.requestTimeoutMs());
+ kafkaProperties.put("max.poll.interval.ms", kafkaConfig.maxPollIntervalMs());
try (
KafkaReader kafkaReader = createKafkaReader(
- readerKafkaProperties, queueTopic, callbackFunction, useMockKafkaConsumer
+ kafkaProperties, queueTopic, callbackFunction, useMockKafkaConsumer
)
) {
- while (run) {
+ while (true) {
kafkaReader.read();
}
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java
index 3b893d41..6ad9d5ae 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/SyslogAvroWriter.java
@@ -57,23 +57,20 @@
import java.io.*;
-public class SyslogAvroWriter implements AutoCloseable {
+public final class SyslogAvroWriter implements AutoCloseable {
private static final Logger LOGGER = LoggerFactory.getLogger(SyslogAvroWriter.class);
- private final DatumWriter datumWriter = new SpecificDatumWriter<>(SyslogRecord.class);
-
+ private final DatumWriter datumWriter;
private final SyncableFileOutputStream syncableFileOutputStream;
-
- private final DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter);
+ private final DataFileWriter dataFileWriter;
public SyslogAvroWriter(File syslogFile) throws IOException {
+ datumWriter = new SpecificDatumWriter<>(SyslogRecord.class);
+ dataFileWriter = new DataFileWriter<>(datumWriter);
dataFileWriter.setCodec(CodecFactory.snappyCodec());
-
- syncableFileOutputStream = new SyncableFileOutputStream(syslogFile);
-
+ syncableFileOutputStream = new SyncableFileOutputStream(syslogFile, true);
syncableFileOutputStream.getChannel().tryLock();
-
if (syslogFile.length() == 0) {
// new file
dataFileWriter.create(SyslogRecord.getClassSchema(), syncableFileOutputStream);
@@ -98,7 +95,7 @@ public void close() throws IOException {
dataFileWriter.close();
}
- public long getFileSize() throws IOException {
+ public long fileSize() throws IOException {
return syncableFileOutputStream.getChannel().size();
}
}
diff --git a/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java b/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/UniqueFileCreated.java
similarity index 93%
rename from src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java
rename to src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/UniqueFileCreated.java
index 4db4116e..ec4b0fce 100644
--- a/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/WritableQueue.java
+++ b/src/main/java/com/teragrep/cfe_39/consumers/kafka/queue/UniqueFileCreated.java
@@ -60,14 +60,15 @@
import java.util.function.ToLongFunction;
import java.util.stream.Stream;
-public class WritableQueue {
+// UniqueFileCreated's responsibility is to create a new File object that doesn't interfere with any existing files in the given directory.
+public class UniqueFileCreated {
- private static final Logger LOGGER = LoggerFactory.getLogger(WritableQueue.class);
+ private static final Logger LOGGER = LoggerFactory.getLogger(UniqueFileCreated.class);
private final Path queueDirectory;
- private String queueNamePrefix;
+ private final String queueNamePrefix;
- public WritableQueue(String queueDirectory, String queueNamePrefix) {
+ public UniqueFileCreated(String queueDirectory, String queueNamePrefix) {
this.queueDirectory = Paths.get(queueDirectory);
this.queueNamePrefix = queueNamePrefix;
if (!Files.isDirectory(this.queueDirectory)) {
@@ -108,13 +109,9 @@ public File getNextWritableFile() throws IOException {
}
}
- public void setQueueNamePrefix(String queueNamePrefix) {
- this.queueNamePrefix = queueNamePrefix;
- }
-
- private BiPredicate getFileMatcher(String queueNamePrefix) {
+ private BiPredicate getFileMatcher(String queueNamePrefixInput) {
return (path, basicFileAttributes) -> {
- if (!path.getFileName().toString().startsWith(queueNamePrefix)) {
+ if (!path.getFileName().toString().startsWith(queueNamePrefixInput)) {
return false;
}
else if (path.getFileName().toString().endsWith(".state")) {
diff --git a/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java b/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java
index 15454eaf..e2e50111 100644
--- a/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java
+++ b/src/main/java/com/teragrep/cfe_39/metrics/DurationStatistics.java
@@ -55,8 +55,8 @@
public class DurationStatistics {
- MetricRegistry metricRegistry = new MetricRegistry();
- private static final Logger LOGGER = LoggerFactory.getLogger(DurationStatistics.class);
+ private final MetricRegistry metricRegistry = new MetricRegistry();
+ private final Logger LOGGER = LoggerFactory.getLogger(DurationStatistics.class);
private Instant lastReportTime = Instant.now();
private long lastBytes = 0L;
private long lastRecords = 0L;
diff --git a/src/test/java/com/teragrep/cfe_39/BatchDistributionTest.java b/src/test/java/com/teragrep/cfe_39/BatchDistributionTest.java
new file mode 100644
index 00000000..c13c17f2
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/BatchDistributionTest.java
@@ -0,0 +1,678 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.BatchDistributionImpl;
+import com.teragrep.cfe_39.consumers.kafka.KafkaRecordImpl;
+import com.teragrep.cfe_39.metrics.DurationStatistics;
+import com.teragrep.cfe_39.metrics.topic.TopicCounter;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.file.DataFileStream;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.junit.jupiter.api.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.function.Consumer;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+// Tests for processing of consumed Kafka records with skipping of broken records enabled (both null and non-RFC5424).
+public class BatchDistributionTest {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(BatchDistributionTest.class);
+
+ private static MiniDFSCluster hdfsCluster;
+ private static File baseDir;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private FileSystem fs;
+
+    // Prepares a known state before every test: makes sure the local queue directory exists, builds the common
+    // configuration, starts an HDFS mini cluster in a fresh temporary directory and creates the FileSystem that the
+    // tests use from the configured HDFS URI. Any exception thrown during the setup fails the test.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+            // Fail early instead of silently continuing when the queue directory cannot be created.
+            Assertions
+                    .assertTrue(
+                            queueDir.isDirectory() || queueDir.mkdirs(), "Unable to create queue directory " + queueDir
+                    );
+            Map<String, String> map = new HashMap<>();
+            map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+            map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+            map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+            map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+            map.put("queueTopicPattern", "^testConsumerTopic-*$");
+            // Both kinds of broken records are skipped in this test class.
+            map.put("skipNonRFC5424Records", "true");
+            map.put("skipEmptyRFC5424Records", "true");
+            map.put("pruneOffset", "157784760000");
+            map.put("consumerTimeout", "600000");
+            config = new CommonConfiguration(map);
+
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+            Map<String, String> hdfsMap = new HashMap<>();
+            hdfsMap.put("pruneOffset", "157784760000");
+            // Point the HDFS configuration at the name node port of the cluster that was just started.
+            hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+            hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+            // The Kerberos and transfer protection entries below carry placeholder values.
+            hdfsMap.put("java.security.krb5.kdc", "test");
+            hdfsMap.put("java.security.krb5.realm", "test");
+            hdfsMap.put("hadoop.security.authentication", "kerberos");
+            hdfsMap.put("hadoop.security.authorization", "test");
+            hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+            hdfsMap.put("KerberosKeytabUser", "test");
+            hdfsMap.put("KerberosKeytabPath", "test");
+            hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+            hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+            hdfsMap.put("dfs.data.transfer.protection", "test");
+            hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+            // NOTE(review): presumably the size limit that makes the tests roll files over to HDFS - confirm.
+            hdfsMap.put("maximumFileSize", "3000");
+            hdfsConfig = new HdfsConfiguration(hdfsMap);
+            fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+        });
+    }
+
+    // Tears down the mini cluster after every test: closes the test FileSystem, shuts the cluster down and deletes
+    // its base directory. The shutdown and the cleanup run in a finally block so that a failing close is still
+    // reported as a test failure but cannot leave a running cluster or its directory behind for the next test.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            assertDoesNotThrow(() -> {
+                fs.close();
+            });
+        }
+        finally {
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
+    // Feeds one batch with offsets 0-15 through BatchDistributionImpl. Offset 14 carries a null value and offset 15
+    // lacks the leading '<' of the priority field. The test expects offsets 0-10 in the HDFS file "0.10" and offsets
+    // 11-13 in a single local queue file; after an empty batch it expects offsets 11-13 in the HDFS file "0.13".
+    @Test
+    public void normalRecordsTest() {
+        // Initialize and register duration statistics
+        DurationStatistics durationStatistics = new DurationStatistics();
+        durationStatistics.register();
+
+        assertDoesNotThrow(() -> {
+
+            BatchDistributionImpl output = new BatchDistributionImpl(
+                    config, // Configuration settings
+                    hdfsConfig, // HDFS configuration settings
+                    "topicName", // String, the name of the topic
+                    durationStatistics, // DurationStatistics object from metrics
+                    new TopicCounter("topicName") // TopicCounter object from metrics
+            );
+
+            List<KafkaRecordImpl> kafkaRecordList = new ArrayList<>();
+
+            kafkaRecordList.add(kafkaRecordAt(
+                    0L,
+                    "2022-04-25T07:34:50.804Z",
+                    "<12>1 2022-04-25T07:34:50.804Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"835bf792-91cf-44e3-976b-518330bb8fd3\" source=\"source\" unixtime=\"1650872090805\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    1L,
+                    "2022-04-25T07:34:50.806Z",
+                    "<12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    2L,
+                    "2022-04-25T07:34:50.822Z",
+                    "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"1848d8a1-2f08-4a1e-bec4-ff9e6dd92553\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi."
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    3L,
+                    "2022-04-25T07:34:50.822Z",
+                    "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"5e1a0398-c2a0-468d-a562-c3bb31f0f853\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi."
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    4L,
+                    "2022-04-25T07:34:50.822Z",
+                    "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"6268c3a2-5bda-427f-acce-29416eb445f4\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi."
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    5L,
+                    "2022-04-25T07:34:52.238Z",
+                    "<12>1 2022-04-25T07:34:52.238Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"b500dcaf-1101-4000-b6b9-bfb052ddbf86\" source=\"source\" unixtime=\"1650872092238\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    6L,
+                    "2022-04-25T07:34:52.239Z",
+                    "<12>1 2022-04-25T07:34:52.239Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"05363122-51ac-4c0b-a681-f5868081f56d\" source=\"source\" unixtime=\"1650872092239\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    7L,
+                    "2022-04-25T07:34:52.239Z",
+                    "<12>1 2022-04-25T07:34:52.239Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"7bbcd843-b795-4c14-b4a1-95f5d445cbcd\" source=\"source\" unixtime=\"1650872092239\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    8L,
+                    "2022-04-25T07:34:52.240Z",
+                    "<12>1 2022-04-25T07:34:52.240Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"2bc0a9f9-237d-4656-b40a-3038aace37f0\" source=\"source\" unixtime=\"1650872092240\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    9L,
+                    "2022-04-25T07:34:52.240Z",
+                    "<12>1 2022-04-25T07:34:52.240Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"ecf61e8d-e3a7-48ef-9b73-3c5a5243d2e6\" source=\"source\" unixtime=\"1650872092240\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    10L,
+                    "2022-04-25T07:34:52.241Z",
+                    "<12>1 2022-04-25T07:34:52.241Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"bf101d5a-e816-4f51-b132-97f8e3431f8e\" source=\"source\" unixtime=\"1650872092241\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    11L,
+                    "2022-04-25T07:34:52.241Z",
+                    "<12>1 2022-04-25T07:34:52.241Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"ef94d9e9-3c44-4892-b5a6-bf361d13ff97\" source=\"source\" unixtime=\"1650872092241\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    12L,
+                    "2022-04-25T07:34:52.242Z",
+                    "<12>1 2022-04-25T07:34:52.242Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"5bce6e3d-767d-44b4-a044-6c4872f8f2b5\" source=\"source\" unixtime=\"1650872092242\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]"
+            ));
+            kafkaRecordList.add(kafkaRecordAt(
+                    13L,
+                    "2022-04-25T07:34:52.243Z",
+                    "<12>1 2022-04-25T07:34:52.243Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"3bb55ce4-0ea7-413a-b403-28b174d7ac99\" source=\"source\" unixtime=\"1650872092243\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]"
+            ));
+            // Broken record: null value.
+            kafkaRecordList.add(kafkaRecordAt(14L, "2022-04-25T07:34:52.244Z", null));
+            // Broken record: the value starts with "12>1" instead of "<12>1".
+            kafkaRecordList.add(kafkaRecordAt(
+                    15L,
+                    "2022-04-25T07:34:52.245Z",
+                    "12>1 2022-04-25T07:34:52.245Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"3bb55ce4-0ea7-413a-b403-28b174d7ac99\" source=\"source\" unixtime=\"1650872092243\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872092\"] 25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]"
+            ));
+
+            output.accept(kafkaRecordList);
+
+            // Assert that records 11-13, and nothing else, are present in the single local avro-file.
+
+            File queueDirectory = new File(config.queueDirectory());
+            File[] files = queueDirectory.listFiles();
+            Assertions.assertNotNull(files);
+            Assertions.assertEquals(1, files.length);
+
+            DatumReader<SyslogRecord> datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+            // The reader is closed before the flush further below so that no handle stays open on the local file.
+            try (DataFileReader<SyslogRecord> dataFileReader = new DataFileReader<>(files[0], datumReader)) {
+                for (int i = 11; i <= 13; i++) {
+                    Assertions.assertTrue(dataFileReader.hasNext());
+                    SyslogRecord next = dataFileReader.next();
+                    Assertions.assertEquals(i, next.getOffset());
+                }
+                Assertions.assertFalse(dataFileReader.hasNext());
+            }
+
+            // Assert that records 0-10 are present in HDFS
+
+            String topicDirectory = hdfsConfig.hdfsPath() + "/" + "topicName";
+            Assertions.assertEquals(1, fs.listStatus(new Path(topicDirectory)).length);
+            Path hdfsreadpath = new Path(topicDirectory + "/" + "0.10");
+            Assertions.assertTrue(fs.exists(hdfsreadpath));
+            // The data is in AVRO-format, so it is read through an Avro DataFileStream instead of as a string.
+            try (
+                    FSDataInputStream inputStream = fs.open(hdfsreadpath);
+                    DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+                            inputStream,
+                            new SpecificDatumReader<>(SyslogRecord.class)
+                    )
+            ) {
+                LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+                for (int i = 0; i <= 10; i++) {
+                    Assertions.assertTrue(reader.hasNext());
+                    SyslogRecord syslogRecord = reader.next();
+                    Assertions.assertEquals(i, syslogRecord.getOffset());
+                }
+                Assertions.assertFalse(reader.hasNext());
+            }
+
+            // Use empty batch to flush the local files to HDFS.
+
+            List<KafkaRecordImpl> kafkaRecordListEmpty = new ArrayList<>();
+            output.accept(kafkaRecordListEmpty);
+            Assertions.assertEquals(2, fs.listStatus(new Path(topicDirectory)).length);
+            hdfsreadpath = new Path(topicDirectory + "/" + "0.13");
+            Assertions.assertTrue(fs.exists(hdfsreadpath));
+            try (
+                    FSDataInputStream inputStream2 = fs.open(hdfsreadpath);
+                    DataFileStream<SyslogRecord> reader2 = new DataFileStream<>(
+                            inputStream2,
+                            new SpecificDatumReader<>(SyslogRecord.class)
+                    )
+            ) {
+                LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+                for (int i = 11; i <= 13; i++) {
+                    Assertions.assertTrue(reader2.hasNext());
+                    SyslogRecord syslogRecord2 = reader2.next();
+                    Assertions.assertEquals(i, syslogRecord2.getOffset());
+                }
+                Assertions.assertFalse(reader2.hasNext());
+            }
+        });
+    }
+
+    // Wraps the given key and value into a ConsumerRecord for topic "topicName", partition 0 and the given offset,
+    // and converts it into a KafkaRecordImpl. A null value is passed through as a null payload.
+    private KafkaRecordImpl kafkaRecordAt(long offset, String key, String value) {
+        byte[] payload = null;
+        if (value != null) {
+            payload = value.getBytes(StandardCharsets.UTF_8);
+        }
+        ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(
+                "topicName",
+                0,
+                offset,
+                key.getBytes(StandardCharsets.UTF_8),
+                payload
+        );
+        return new KafkaRecordImpl(record.topic(), record.partition(), record.offset(), record.value());
+    }
+
+    // Feeds three records through BatchDistributionImpl: the values at offsets 1 and 2 start with "12>1" instead of
+    // "<12>1", the value at offset 3 is well-formed. After flushing with an empty batch the test expects a single
+    // HDFS file "0.3" that contains only the record with offset 3.
+    @Test
+    public void skipNonRFC5424DatabaseOutputTest() {
+        // Initialize and register duration statistics
+        DurationStatistics durationStatistics = new DurationStatistics();
+        durationStatistics.register();
+
+        assertDoesNotThrow(() -> {
+
+            Consumer<List<KafkaRecordImpl>> output = new BatchDistributionImpl(
+                    config, // Configuration settings
+                    hdfsConfig, // HDFS configuration settings
+                    "topicName", // String, the name of the topic
+                    durationStatistics, // DurationStatistics object from metrics
+                    new TopicCounter("topicName") // TopicCounter object from metrics
+            );
+
+            ConsumerRecord<byte[], byte[]> record1 = new ConsumerRecord<>(
+                    "topicName",
+                    0,
+                    1L,
+                    "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
+                    "12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+                            .getBytes(StandardCharsets.UTF_8)
+            );
+            KafkaRecordImpl kafkaRecord1 = new KafkaRecordImpl(
+                    record1.topic(),
+                    record1.partition(),
+                    record1.offset(),
+                    record1.value()
+            );
+
+            ConsumerRecord<byte[], byte[]> record2 = new ConsumerRecord<>(
+                    "topicName",
+                    0,
+                    2L,
+                    "2022-04-25T07:34:50.8067".getBytes(StandardCharsets.UTF_8),
+                    "12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+                            .getBytes(StandardCharsets.UTF_8)
+            );
+            KafkaRecordImpl kafkaRecord2 = new KafkaRecordImpl(
+                    record2.topic(),
+                    record2.partition(),
+                    record2.offset(),
+                    record2.value()
+            );
+
+            ConsumerRecord<byte[], byte[]> record3 = new ConsumerRecord<>(
+                    "topicName",
+                    0,
+                    3L,
+                    "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
+                    "<12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+                            .getBytes(StandardCharsets.UTF_8)
+            );
+            KafkaRecordImpl kafkaRecord3 = new KafkaRecordImpl(
+                    record3.topic(),
+                    record3.partition(),
+                    record3.offset(),
+                    record3.value()
+            );
+
+            List<KafkaRecordImpl> kafkaRecordList = new ArrayList<>();
+            kafkaRecordList.add(kafkaRecord1);
+            kafkaRecordList.add(kafkaRecord2);
+            kafkaRecordList.add(kafkaRecord3);
+            output.accept(kafkaRecordList);
+            // Use empty batch to flush the local files to HDFS.
+            output.accept(new ArrayList<>());
+            Assertions.assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "topicName")).length);
+            Path hdfsreadpath = new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.3");
+            Assertions.assertTrue(fs.exists(hdfsreadpath));
+
+            // The file in HDFS must not contain the two malformed records (offsets 1 and 2):
+            // assert that it contains exactly the one expected record with offset 3.
+
+            // The data is in AVRO-format, so it is read through an Avro DataFileStream instead of as a string.
+            try (
+                    FSDataInputStream inputStream = fs.open(hdfsreadpath);
+                    DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+                            inputStream,
+                            new SpecificDatumReader<>(SyslogRecord.class)
+                    )
+            ) {
+                LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+                Assertions.assertTrue(reader.hasNext());
+                SyslogRecord syslogRecord = reader.next();
+                Assertions
+                        .assertEquals(
+                                "{\"timestamp\": 1650872090807000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"0\", \"offset\": 3, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
+                                syslogRecord.toString()
+                        );
+
+                Assertions.assertFalse(reader.hasNext());
+            }
+        });
+    }
+
+ @Test
+ public void skipNullRFC5424DatabaseOutputTest() {
+ // Initialize and register duration statistics
+ DurationStatistics durationStatistics = new DurationStatistics();
+ durationStatistics.register();
+
+ // per-topic counter list (not referenced again in this test)
+ List<TopicCounter> topicCounters = new CopyOnWriteArrayList<>();
+
+ assertDoesNotThrow(() -> {
+ // A null-valued record (offset 1) and a valid RFC5424 record (offset 2) are fed in; only the valid one may end up in HDFS.
+ Consumer<List<KafkaRecordImpl>> output = new BatchDistributionImpl(
+ config, // Configuration settings
+ hdfsConfig,
+ "topicName", // String, the name of the topic
+ durationStatistics, // RuntimeStatistics object from metrics
+ new TopicCounter("topicName") // TopicCounter object from metrics
+ );
+
+ ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 1L,
+ "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
+ null
+ );
+ KafkaRecordImpl kafkaRecord = new KafkaRecordImpl(
+ record.topic(),
+ record.partition(),
+ record.offset(),
+ record.value()
+ );
+
+ ConsumerRecord<byte[], byte[]> record3 = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 2L,
+ "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
+ "<12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ KafkaRecordImpl kafkaRecord3 = new KafkaRecordImpl(
+ record3.topic(),
+ record3.partition(),
+ record3.offset(),
+ record3.value()
+ );
+
+ List<KafkaRecordImpl> kafkaRecordList = new ArrayList<>();
+ kafkaRecordList.add(kafkaRecord);
+ kafkaRecordList.add(kafkaRecord3);
+ output.accept(kafkaRecordList);
+ output.accept(new ArrayList<>()); // NOTE(review): the empty batch presumably forces the pending file to be committed to HDFS - confirm
+ Assertions.assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "topicName")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.2")));
+ // The file is named after the last offset (2); the null-valued record at offset 1 must not appear in it.
+
+ // Assert that the file in hdfs contains exactly the one expected record.
+
+ Path hdfsreadpath = new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.2");
+ //Init input stream
+ FSDataInputStream inputStream = fs.open(hdfsreadpath);
+ //The data is in AVRO-format, so it can't be read as a string.
+ DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+ inputStream,
+ new SpecificDatumReader<>(SyslogRecord.class)
+ );
+ LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+ Assertions.assertTrue(reader.hasNext());
+ SyslogRecord syslogRecord = reader.next();
+ Assertions
+ .assertEquals(
+ "{\"timestamp\": 1650872090807000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"0\", \"offset\": 2, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
+ syslogRecord.toString()
+ );
+ Assertions.assertFalse(reader.hasNext());
+ reader.close(); // release the HDFS input stream wrapped by the Avro reader
+ });
+
+ }
+
+ @Test
+ public void skipNullAndNonRFC5424DatabaseOutputTest() {
+ // Initialize and register duration statistics
+ DurationStatistics durationStatistics = new DurationStatistics();
+ durationStatistics.register();
+
+ // per-topic counter list (not referenced again in this test)
+ List<TopicCounter> topicCounters = new CopyOnWriteArrayList<>();
+
+ assertDoesNotThrow(() -> {
+ // Three records are fed in: null-valued (offset 1), malformed (offset 2) and valid RFC5424 (offset 3); only the last may end up in HDFS.
+ Consumer<List<KafkaRecordImpl>> output = new BatchDistributionImpl(
+ config, // Configuration settings
+ hdfsConfig,
+ "topicName", // String, the name of the topic
+ durationStatistics, // RuntimeStatistics object from metrics
+ new TopicCounter("topicName") // TopicCounter object from metrics
+ );
+
+ List<KafkaRecordImpl> kafkaRecordList = new ArrayList<>();
+
+ ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 1L,
+ "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
+ null
+ );
+ KafkaRecordImpl kafkaRecord = new KafkaRecordImpl(
+ record.topic(),
+ record.partition(),
+ record.offset(),
+ record.value()
+ );
+ kafkaRecordList.add(kafkaRecord);
+ // Offset 2: the priority is deliberately malformed ("12>1" without the leading '<'), so the payload is not valid RFC5424.
+ record = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 2L,
+ "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
+ "12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ kafkaRecord = new KafkaRecordImpl(record.topic(), record.partition(), record.offset(), record.value());
+ kafkaRecordList.add(kafkaRecord);
+ record = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 3L,
+ "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
+ "<12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ kafkaRecord = new KafkaRecordImpl(record.topic(), record.partition(), record.offset(), record.value());
+ kafkaRecordList.add(kafkaRecord);
+ output.accept(kafkaRecordList);
+ output.accept(new ArrayList<>()); // NOTE(review): the empty batch presumably forces the pending file to be committed to HDFS - confirm
+ Assertions.assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "topicName")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.3")));
+
+ // Assert that the file in hdfs contains the expected single record.
+
+ Path hdfsreadpath = new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.3");
+ //Init input stream
+ FSDataInputStream inputStream = fs.open(hdfsreadpath);
+ //The data is in AVRO-format, so it can't be read as a string.
+ DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+ inputStream,
+ new SpecificDatumReader<>(SyslogRecord.class)
+ );
+ LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+ Assertions.assertTrue(reader.hasNext());
+ SyslogRecord syslogRecord = reader.next();
+ Assertions
+ .assertEquals(
+ "{\"timestamp\": 1650872090807000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"0\", \"offset\": 3, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
+ syslogRecord.toString()
+ );
+ Assertions.assertFalse(reader.hasNext());
+ reader.close(); // release the HDFS input stream wrapped by the Avro reader
+ });
+ }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/CommonConfigurationTest.java b/src/test/java/com/teragrep/cfe_39/CommonConfigurationTest.java
new file mode 100644
index 00000000..56c9ff54
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/CommonConfigurationTest.java
@@ -0,0 +1,92 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cnf_01.PathConfiguration;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+public class CommonConfigurationTest {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(CommonConfigurationTest.class);
+ // Loads valid.application.properties through PathConfiguration and checks the CommonConfiguration accessors against it.
+ @Test
+ public void configurationTest() {
+ assertDoesNotThrow(() -> {
+ final PathConfiguration pathConfiguration = new PathConfiguration(
+ System.getProperty("user.dir") + "/src/test/resources/valid.application.properties"
+ );
+ final Map<String, String> map;
+ map = pathConfiguration.asMap();
+ Assertions // NOTE(review): compares Map.toString(), so the expected string depends on the map's iteration order
+ .assertEquals(
+ "{queueTopicPattern=^testConsumerTopic-*$, queueDirectory=/opt/teragrep/cfe_39/etc/AVRO/, skipNonRFC5424Records=true, skipEmptyRFC5424Records=true, consumerTimeout=600000}",
+ map.toString()
+ );
+ CommonConfiguration commonConfig = new CommonConfiguration(map);
+ // NOTE(review): the *.configurationFile keys are absent from the map asserted above; presumably CommonConfiguration supplies them itself - confirm.
+ // Assert that the accessors return the expected values.
+ Assertions
+ .assertEquals(System.getProperty("user.dir") + "/rpm/resources/egress.properties", commonConfig.egressConfigurationFile());
+ Assertions
+ .assertEquals(System.getProperty("user.dir") + "/rpm/resources/ingress.properties", commonConfig.ingressConfigurationFile());
+ Assertions
+ .assertEquals(System.getProperty("user.dir") + "/rpm/resources/log4j2.properties", commonConfig.log4j2ConfigurationFile());
+ Assertions.assertEquals(600000, commonConfig.consumerTimeout());
+ Assertions.assertTrue(commonConfig.skipNonRFC5424Records());
+ Assertions.assertTrue(commonConfig.skipEmptyRFC5424Records());
+ Assertions.assertEquals("/opt/teragrep/cfe_39/etc/AVRO/", commonConfig.queueDirectory());
+ Assertions.assertEquals("^testConsumerTopic-*$", commonConfig.queueTopicPattern());
+ });
+ }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/HdfsConfigurationTest.java b/src/test/java/com/teragrep/cfe_39/HdfsConfigurationTest.java
new file mode 100644
index 00000000..560ca63d
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/HdfsConfigurationTest.java
@@ -0,0 +1,96 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cnf_01.PathConfiguration;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+public class HdfsConfigurationTest {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(HdfsConfigurationTest.class);
+ // Loads valid.hdfs.properties through PathConfiguration and checks every HdfsConfiguration accessor against it.
+ @Test
+ public void configurationTest() {
+ assertDoesNotThrow(() -> {
+ final PathConfiguration hdfsPathConfiguration = new PathConfiguration(
+ System.getProperty("user.dir") + "/src/test/resources/valid.hdfs.properties"
+ );
+ final Map<String, String> hdfsMap;
+ hdfsMap = hdfsPathConfiguration.asMap();
+ Assertions // NOTE(review): compares Map.toString(), so the expected string depends on the map's iteration order
+ .assertEquals(
+ "{pruneOffset=157784760000, hdfsuri=hdfs://localhost:45937/, dfs.namenode.kerberos.principal.pattern=test, hadoop.security.authentication=kerberos, dfs.encrypt.data.transfer.cipher.suites=test, java.security.krb5.kdc=test, maximumFileSize=3000, KerberosKeytabPath=test, dfs.data.transfer.protection=test, dfs.client.use.datanode.hostname=false, hadoop.kerberos.keytab.login.autorenewal.enabled=true, KerberosKeytabUser=test, java.security.krb5.realm=test, hadoop.security.authorization=test, hdfsPath=hdfs:///opt/teragrep/cfe_39/srv/}",
+ hdfsMap.toString()
+ );
+ HdfsConfiguration hdfsConfig = new HdfsConfiguration(hdfsMap);
+
+ // Assert that the accessors return the values listed in the map above.
+ Assertions.assertEquals(157784760000L, hdfsConfig.pruneOffset());
+ Assertions.assertEquals("hdfs://localhost:45937/", hdfsConfig.hdfsUri());
+ Assertions.assertEquals("hdfs:///opt/teragrep/cfe_39/srv/", hdfsConfig.hdfsPath());
+ Assertions.assertEquals("test", hdfsConfig.javaSecurityKrb5Kdc());
+ Assertions.assertEquals("test", hdfsConfig.javaSecurityKrb5Realm());
+ Assertions.assertEquals("kerberos", hdfsConfig.hadoopSecurityAuthentication());
+ Assertions.assertEquals("test", hdfsConfig.hadoopSecurityAuthorization());
+ Assertions.assertEquals("test", hdfsConfig.dfsNamenodeKerberosPrincipalPattern());
+ Assertions.assertEquals("test", hdfsConfig.KerberosKeytabUser());
+ Assertions.assertEquals("test", hdfsConfig.KerberosKeytabPath());
+ Assertions.assertEquals("false", hdfsConfig.dfsClientUseDatanodeHostname());
+ Assertions.assertEquals("true", hdfsConfig.hadoopKerberosKeytabLoginAutorenewalEnabled());
+ Assertions.assertEquals("test", hdfsConfig.dfsDataTransferProtection());
+ Assertions.assertEquals("test", hdfsConfig.dfsEncryptDataTransferCipherSuites());
+ Assertions.assertEquals(3000, hdfsConfig.maximumFileSize());
+ });
+ }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/HdfsTest.java b/src/test/java/com/teragrep/cfe_39/HdfsTest.java
index 315e6dac..b84f8361 100644
--- a/src/test/java/com/teragrep/cfe_39/HdfsTest.java
+++ b/src/test/java/com/teragrep/cfe_39/HdfsTest.java
@@ -47,6 +47,8 @@
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSWrite;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +61,8 @@
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -69,21 +73,50 @@ public class HdfsTest {
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Start minicluster and initialize config.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map);
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "kerberos");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
});
}
@@ -101,12 +134,11 @@ public void teardownMiniCluster() {
public void hdfsWriteTest() {
// This test case is for testing the functionality of the HDFSWrite.java by writing pre-generated AVRO-files to the HDFS database and asserting the results are correct.
assertDoesNotThrow(() -> {
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
- // writer.commit will delete the file that is given as an input argument. Copy the mock files to another directory so the deletion can be asserted properly too.
String pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.9";
java.nio.file.Path sourceFile = Paths.get(pathname);
- java.nio.file.Path targetDir = Paths.get(config.getQueueDirectory());
+ java.nio.file.Path targetDir = Paths.get(config.queueDirectory());
java.nio.file.Path targetFile = targetDir.resolve(sourceFile.getFileName());
Assertions.assertFalse(targetFile.toFile().exists());
Files.copy(sourceFile, targetFile);
@@ -115,32 +147,35 @@ public void hdfsWriteTest() {
JsonObject recordOffsetJo = JsonParser
.parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
.getAsJsonObject();
- try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
- writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
+ try (HDFSWrite writer = new HDFSWrite(hdfsConfig, "testConsumerTopic", "0", 9)) {
+ writer.commit(avroFile); // commits avroFile to HDFS.
}
+ targetFile.toFile().delete(); // writer no longer handles deletion of the files
Assertions.assertFalse(targetFile.toFile().exists());
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.13";
sourceFile = Paths.get(pathname);
- targetDir = Paths.get(config.getQueueDirectory());
+ targetDir = Paths.get(config.queueDirectory());
targetFile = targetDir.resolve(sourceFile.getFileName());
Files.copy(sourceFile, targetFile);
Assertions.assertTrue(targetFile.toFile().exists());
- avroFile = new File(config.getQueueDirectory() + "/0.13");
+ avroFile = new File(config.queueDirectory() + "/0.13");
recordOffsetJo = JsonParser
.parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":13}")
.getAsJsonObject();
- try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
+ try (HDFSWrite writer = new HDFSWrite(hdfsConfig, "testConsumerTopic", "0", 13)) {
writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
}
+ targetFile.toFile().delete(); // writer no longer handles deletion of the files
Assertions.assertFalse(targetFile.toFile().exists());
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
});
}
@@ -148,44 +183,44 @@ public void hdfsWriteTest() {
public void hdfsWriteExceptionTest() {
// This test case is for testing the functionality of the HDFSWrite.java exception handling by trying to write the same file twice and asserting that the proper exception is thrown.
assertDoesNotThrow(() -> {
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
- // writer.commit will delete the source file that is given as an input argument. Copy the mock file to another directory so the deletion of the source file can be asserted properly.
String pathname = System.getProperty("user.dir") + "/src/test/resources/mockHdfsFiles/0.9";
java.nio.file.Path sourceFile = Paths.get(pathname);
- java.nio.file.Path targetDir = Paths.get(config.getQueueDirectory());
+ java.nio.file.Path targetDir = Paths.get(config.queueDirectory());
java.nio.file.Path targetFile = targetDir.resolve(sourceFile.getFileName());
Assertions.assertFalse(targetFile.toFile().exists());
Files.copy(sourceFile, targetFile);
-
Assertions.assertTrue(targetFile.toFile().exists());
File avroFile = new File(targetFile.toUri());
JsonObject recordOffsetJo = JsonParser
.parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
.getAsJsonObject();
- try (HDFSWrite writer = new HDFSWrite(config, recordOffsetJo)) {
- writer.commit(avroFile); // commits avroFile to HDFS and deletes avroFile afterward.
+ try (HDFSWrite writer = new HDFSWrite(hdfsConfig, "testConsumerTopic", "0", 9)) {
+ writer.commit(avroFile); // commits avroFile to HDFS.
}
+ targetFile.toFile().delete(); // writer no longer handles deletion of the files
Assertions.assertFalse(targetFile.toFile().exists());
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Files.copy(sourceFile, targetFile);
Assertions.assertTrue(targetFile.toFile().exists());
- avroFile = new File(config.getQueueDirectory() + "/0.9");
+ avroFile = new File(config.queueDirectory() + "/0.9");
recordOffsetJo = JsonParser
.parseString("{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":9}")
.getAsJsonObject();
- HDFSWrite writer = new HDFSWrite(config, recordOffsetJo);
+ HDFSWrite writer = new HDFSWrite(hdfsConfig, "testConsumerTopic", "0", 9);
File finalAvroFile = avroFile;
Exception e = Assertions.assertThrows(Exception.class, () -> writer.commit(finalAvroFile));
Assertions.assertEquals("File 0.9 already exists", e.getMessage());
writer.close();
+ targetFile.toFile().delete(); // writer no longer handles deletion of the files
Assertions.assertFalse(targetFile.toFile().exists());
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion0FilesLowSizeTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesLowSizeTest.java
new file mode 100644
index 00000000..95e49247
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesLowSizeTest.java
@@ -0,0 +1,246 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.fs.Path;
+
+import java.io.File;
+import java.net.URI;
+import java.nio.file.Files;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+/**
+ * Ingestion test against an initially empty HDFS mini cluster with a low {@code maximumFileSize} (3000).
+ * The assertions expect one committed file {@code <partition>.10} per partition in HDFS, while the records
+ * with offsets 11-13 of every partition remain in local AVRO files under the queue directory.
+ */
+public class Ingestion0FilesLowSizeTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion0FilesLowSizeTest.class);
+    private static MiniDFSCluster hdfsCluster;
+    private static File baseDir;
+    private static CommonConfiguration config;
+    private static HdfsConfiguration hdfsConfig;
+    private static KafkaConfiguration kafkaConfig;
+    private FileSystem fs;
+
+    /**
+     * Prepares known state for testing: creates the local queue directory, builds the common, HDFS and Kafka
+     * configurations from in-memory maps and starts an HDFS mini cluster with a file system client.
+     */
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+            if (!queueDir.exists()) {
+                queueDir.mkdirs();
+            }
+            Map<String, String> map = new HashMap<>();
+            map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+            map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+            map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+            map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+            map.put("queueTopicPattern", "^testConsumerTopic-*$");
+            map.put("skipNonRFC5424Records", "true");
+            map.put("skipEmptyRFC5424Records", "true");
+            map.put("pruneOffset", "157784760000");
+            map.put("consumerTimeout", "600000");
+            config = new CommonConfiguration(map);
+
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+            Map<String, String> hdfsMap = new HashMap<>();
+            hdfsMap.put("pruneOffset", "157784760000");
+            hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+            hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+            hdfsMap.put("java.security.krb5.kdc", "test");
+            hdfsMap.put("java.security.krb5.realm", "test");
+            hdfsMap.put("hadoop.security.authentication", "false");
+            hdfsMap.put("hadoop.security.authorization", "test");
+            hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+            hdfsMap.put("KerberosKeytabUser", "test");
+            hdfsMap.put("KerberosKeytabPath", "test");
+            hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+            hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+            hdfsMap.put("dfs.data.transfer.protection", "test");
+            hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+            hdfsMap.put("maximumFileSize", "3000");
+            hdfsConfig = new HdfsConfiguration(hdfsMap);
+            fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+            Map<String, String> kafkaMap = new HashMap<>();
+            kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+            kafkaMap.put("bootstrap.servers", "test");
+            kafkaMap.put("auto.offset.reset", "earliest");
+            kafkaMap.put("enable.auto.commit", "false");
+            kafkaMap.put("group.id", "cfe_39");
+            kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+            kafkaMap.put("sasl.mechanism", "PLAIN");
+            kafkaMap.put("max.poll.records", "500");
+            kafkaMap.put("fetch.max.bytes", "1073741820");
+            kafkaMap.put("request.timeout.ms", "300000");
+            kafkaMap.put("max.poll.interval.ms", "300000");
+            kafkaMap.put("useMockKafkaConsumer", "true");
+            kafkaMap.put("numOfConsumers", "2");
+            kafkaConfig = new KafkaConfiguration(kafkaMap);
+        });
+    }
+
+    /**
+     * Tears down the mini cluster. The cluster is shut down and its base directory deleted even when closing
+     * the file system client fails, so a failing close cannot leak the cluster.
+     */
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            assertDoesNotThrow(() -> {
+                fs.close();
+            });
+        }
+        finally {
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
+    @DisabledIfSystemProperty(
+            named = "skipIngestionTest",
+            matches = "true"
+    )
+    @Test
+    public void ingestion0FilesLowSizeTest() {
+        /*This test case is for testing the functionality of the ingestion when the maximum file size is small
+        enough for files to be committed to HDFS during ingestion.
+        Maximum file size is set to 3,000 in the config.
+        Empty HDFS database, records in mock kafka consumer ready for ingestion.
+        For each of the 10 topic partitions a single avro-file named <partition>.10 is expected in HDFS,
+        while the records with offsets 11-13 are expected to remain in a local avro-file in the queue
+        directory.*/
+        assertDoesNotThrow(() -> {
+            Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+            Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
+            HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
+            hdfsDataIngestion.run();
+        });
+
+        // Assert that the kafka records were ingested correctly: HDFS holds the committed files, the rest is local.
+
+        // Check that the files were properly written to HDFS.
+        String hdfsuri = hdfsConfig.hdfsUri();
+
+        String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic";
+        // ====== Init HDFS File System Object
+        Configuration conf = new Configuration();
+        // Set FileSystem URI
+        conf.set("fs.defaultFS", hdfsuri);
+        // Because of Maven
+        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+        // Set HADOOP user
+        System.setProperty("HADOOP_USER_NAME", "hdfs");
+        System.setProperty("hadoop.home.dir", "/");
+        //Get the filesystem - HDFS
+        assertDoesNotThrow(() -> {
+            // NOTE: this replaces the client created in startMiniCluster; teardown closes this instance.
+            fs = FileSystem.get(URI.create(hdfsuri), conf);
+
+            Path newDirectoryPath = new Path(path);
+            Assertions.assertTrue(fs.exists(newDirectoryPath));
+
+            // Assert that HDFS holds exactly the expected 10 files, one <partition>.10 file per partition.
+            Assertions.assertEquals(10, fs.listStatus(newDirectoryPath).length);
+            for (int partition = 0; partition <= 9; partition++) {
+                Path committedFile = new Path(path + "/" + partition + ".10");
+                Assertions.assertTrue(fs.exists(committedFile), "Missing HDFS file " + committedFile);
+            }
+            LOGGER.debug("All expected files present in HDFS.");
+
+            // Now assert the files that were too small to be stored in HDFS.
+
+            List<String> filenameList = new ArrayList<>();
+            for (int i = 0; i <= 9; i++) {
+                filenameList.add("testConsumerTopic" + i + "." + 1);
+            }
+
+            for (String fileName : filenameList) {
+                File avroFile = new File(config.queueDirectory() + "/" + fileName);
+                Assertions.assertTrue(avroFile.isFile(), "Missing local AVRO file " + avroFile);
+
+                // Each local file must hold exactly the records with offsets 11, 12 and 13.
+                DatumReader<SyslogRecord> datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+                // try-with-resources closes the reader even when an assertion fails.
+                try (DataFileReader<SyslogRecord> reader = new DataFileReader<>(avroFile, datumReader)) {
+                    for (int i = 11; i <= 13; i++) {
+                        Assertions.assertTrue(reader.hasNext());
+                        SyslogRecord record = reader.next();
+                        Assertions.assertEquals(i, record.getOffset());
+                    }
+                    Assertions.assertFalse(reader.hasNext());
+                }
+                // Remove the verified local file so it does not linger in the queue directory.
+                avroFile.delete();
+            }
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java
index cf2fe882..ac254eef 100644
--- a/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/Ingestion0FilesTest.java
@@ -46,10 +46,13 @@
package com.teragrep.cfe_39;
import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
-import org.apache.avro.file.DataFileStream;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.jupiter.api.*;
@@ -59,7 +62,6 @@
import org.apache.hadoop.fs.Path;
import java.io.File;
-import java.net.URI;
import java.nio.file.Files;
import java.util.*;
@@ -70,21 +72,68 @@ public class Ingestion0FilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion0FilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map);
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "30000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+ Map kafkaMap = new HashMap<>();
+ kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+ kafkaMap.put("bootstrap.servers", "test");
+ kafkaMap.put("auto.offset.reset", "earliest");
+ kafkaMap.put("enable.auto.commit", "false");
+ kafkaMap.put("group.id", "cfe_39");
+ kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+ kafkaMap.put("sasl.mechanism", "PLAIN");
+ kafkaMap.put("max.poll.records", "500");
+ kafkaMap.put("fetch.max.bytes", "1073741820");
+ kafkaMap.put("request.timeout.ms", "300000");
+ kafkaMap.put("max.poll.interval.ms", "300000");
+ kafkaMap.put("useMockKafkaConsumer", "true");
+ kafkaMap.put("numOfConsumers", "2");
+ kafkaConfig = new KafkaConfiguration(kafkaMap);
});
}
@@ -106,55 +155,51 @@ public void teardownMiniCluster() {
public void ingestion0FilesTest() {
/*This test case is for testing the functionality of the ingestion when there are no files already present in the database before starting ingestion.
Maximum file size is set to 30,000 in the config.
- Empty HDFS database, 140 records in mock kafka consumer ready for ingestion. All 14 records for each 10 topic partitions are stored in a single avro-file per partition.*/
+ Empty HDFS database, 160 records in mock kafka consumer ready for ingestion. All 14 records for each 10 topic partitions are stored in a single avro-file per partition (2 skipped records per file).*/
assertDoesNotThrow(() -> {
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
- config.setMaximumFileSize(30000); // This parameter defines the amount of records that can fit inside a single AVRO-file.
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
- Thread.sleep(10000);
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
+ HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
hdfsDataIngestion.run();
});
- // Assert that the kafka records were ingested correctly and the database holds the correct 140 records.
+ // Assert that the kafka records were ingested correctly and the database/temporary file holds the correct 140 records (20 broken records were skipped).
assertDoesNotThrow(() -> {
- String path = config.getHdfsPath() + "/" + "testConsumerTopic";
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic";
Path newDirectoryPath = new Path(path);
Assertions.assertTrue(fs.exists(newDirectoryPath));
/* This is the HDFS write path for the files:
- Path hdfswritepath = new Path(newDirectoryPath + "/" + fileName); where newDirectoryPath is config.getHdfsPath() + "/" + lastObject.topic; and filename is lastObject.partition+"."+lastObject.offset;
-
- Create the list of files to read from HDFS. Test setup is created so each of the 0-9 partitions will have 1 file with offset of 13.*/
+ Path hdfswritepath = new Path(newDirectoryPath + "/" + fileName); where newDirectoryPath is hdfsConfig.hdfsPath() + "/" + lastObject.topic; and filename is lastObject.partition+"."+lastObject.offset.*/
List filenameList = new ArrayList<>();
for (int i = 0; i <= 9; i++) {
- filenameList.add(i + "." + 13);
+ filenameList.add("testConsumerTopic" + i + "." + 1);
}
FileStatus[] fileStatuses = fs.listStatus(newDirectoryPath);
- Assertions.assertEquals(filenameList.size(), fileStatuses.length);
- for (FileStatus fileStatus : fileStatuses) {
- Assertions.assertTrue(filenameList.contains(fileStatus.getPath().getName()));
+ Assertions.assertEquals(0, fileStatuses.length);
+ LOGGER.debug("No files present in HDFS as expected as maximum file size hasn't been reached.");
+
+ // Assert that all the records are inside the temporary AVRO-files generated by PartitionFile objects during consumption.
+
+ File queueDirectory = new File(config.queueDirectory());
+ File[] files = queueDirectory.listFiles();
+ Assertions.assertEquals(10, files.length);
+ for (File file : files) {
+ Assertions.assertTrue(filenameList.contains(file.getName()));
}
- LOGGER.debug("All expected files present in HDFS.");
int partitionCounter = 0;
for (String fileName : filenameList) {
- //==== Read files
- LOGGER.info("Read file into hdfs");
- //Create a path
- Path hdfsreadpath = new Path(newDirectoryPath + "/" + fileName); // The path should be the same that was used in writing the file to HDFS.
- //Init input stream
- FSDataInputStream inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- DataFileStream reader = new DataFileStream<>(
- inputStream,
- new SpecificDatumReader<>(SyslogRecord.class)
- );
- SyslogRecord record = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+
+ String path2 = config.queueDirectory() + "/" + fileName;
+ File avroFile = new File(path2);
+
+ Assertions.assertTrue(filenameList.contains(avroFile.getName()));
+ DatumReader datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+ DataFileReader reader = new DataFileReader<>(avroFile, datumReader);
Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
+ SyslogRecord record = reader.next();
Assertions
.assertEquals(
"{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
@@ -295,78 +340,10 @@ record = reader.next(record);
Assertions.assertFalse(reader.hasNext());
LOGGER.info("Partition {} passed assertions.", partitionCounter);
partitionCounter++;
- inputStream.close();
+ reader.close();
+ avroFile.delete();
}
Assertions.assertEquals(10, partitionCounter);
});
}
-
- @DisabledIfSystemProperty(
- named = "skipIngestionTest",
- matches = "true"
- )
- @Test
- public void ingestion0FilesLowSizeTest() {
- /*This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion.
- Maximum file size is set to 3,000 in the config.
- Empty HDFS database, 140 records in mock kafka consumer ready for ingestion. All 14 records for each 10 topic partitions are stored in two avro-files per partition based on MaximumFileSize.*/
- assertDoesNotThrow(() -> {
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
- config.setMaximumFileSize(3000); // This parameter defines the amount of records that can fit inside a single AVRO-file.
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
- Thread.sleep(10000);
- hdfsDataIngestion.run();
- });
-
- // Assert that the kafka records were ingested correctly and the database holds the correct 140 records.
-
- // Check that the files were properly written to HDFS.
- String hdfsuri = config.getHdfsuri();
-
- String path = config.getHdfsPath() + "/" + "testConsumerTopic";
- // ====== Init HDFS File System Object
- Configuration conf = new Configuration();
- // Set FileSystem URI
- conf.set("fs.defaultFS", hdfsuri);
- // Because of Maven
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
- // Set HADOOP user
- System.setProperty("HADOOP_USER_NAME", "hdfs");
- System.setProperty("hadoop.home.dir", "/");
- //Get the filesystem - HDFS
- assertDoesNotThrow(() -> {
- fs = FileSystem.get(URI.create(hdfsuri), conf);
-
- Path workingDir = fs.getWorkingDirectory();
- Path newDirectoryPath = new Path(path);
- Assertions.assertTrue(fs.exists(newDirectoryPath));
-
- // Assert that the kafka records were ingested correctly and the database holds the expected 20 files.
- Assertions
- .assertEquals(20, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13")));
- LOGGER.debug("All expected files present in HDFS.");
- });
- }
}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java
index 79174e47..4c5e616f 100644
--- a/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java
+++ b/src/test/java/com/teragrep/cfe_39/Ingestion1Old1NewFileTest.java
@@ -45,23 +45,26 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.nio.file.Files;
-import java.util.Objects;
-import java.util.Set;
+import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -72,24 +75,71 @@ public class Ingestion1Old1NewFileTest {
private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion1Old1NewFileTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map);
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "30000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+ Map kafkaMap = new HashMap<>();
+ kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+ kafkaMap.put("bootstrap.servers", "test");
+ kafkaMap.put("auto.offset.reset", "earliest");
+ kafkaMap.put("enable.auto.commit", "false");
+ kafkaMap.put("group.id", "cfe_39");
+ kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+ kafkaMap.put("sasl.mechanism", "PLAIN");
+ kafkaMap.put("max.poll.records", "500");
+ kafkaMap.put("fetch.max.bytes", "1073741820");
+ kafkaMap.put("request.timeout.ms", "300000");
+ kafkaMap.put("max.poll.interval.ms", "300000");
+ kafkaMap.put("useMockKafkaConsumer", "true");
+ kafkaMap.put("numOfConsumers", "2");
+ kafkaConfig = new KafkaConfiguration(kafkaMap);
// Inserts pre-made avro-files to HDFS where one file has new timestamp and other old, which are normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -137,39 +187,57 @@ public void teardownMiniCluster() {
@Test
public void ingestion1Old1NewFileTest() {
/* This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion.
- 14 records are inserted to HDFS database before starting ingestion, with 124/140 records in mock kafka consumer ready for ingestion.
- Partitions through 1 to 9 will have only a single file, partition 0 will have 2 files (0.9 and 0.13).
- partition 0 files are pre-made and inserted to the HDFS database with old timestamp for file 0.9 and new for 0.13.
+ 14 records are inserted to HDFS database before starting ingestion, with 126/160 records in mock kafka consumer ready for ingestion (20 broken records + 14 records already in HDFS).
+ Partitions 1 through 9 will each have a single local file containing 14 records. Partition 0 will have 3 files: 0.9 and 0.13 in HDFS and one empty local file.
+ partition 0 HDFS files are pre-made and inserted to the HDFS database with old timestamp for file 0.9 and new for 0.13.
Old files are pruned from the database during ingestion topic scan loops.*/
assertDoesNotThrow(() -> {
// Assert the known starting state.
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
- Assertions.assertTrue((System.currentTimeMillis() - config.getPruneOffset()) > 157784760000L);
- config.setMaximumFileSize(30000);
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
- Thread.sleep(10000);
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+ Assertions.assertTrue((System.currentTimeMillis() - hdfsConfig.pruneOffset()) > 157784760000L);
+ HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
hdfsDataIngestion.run();
- // Assert that the kafka records were ingested and pruned correctly and the database holds only the expected 10 files.
+ // Assert that the kafka records were ingested and pruned correctly and the database holds only the expected 1 file.
+ Assertions
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions
+ .assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Assertions
- .assertEquals(10, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13")));
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+
+ // Assert the avro-files that were too small to be stored in HDFS.
+ String path1 = config.queueDirectory() + "/" + "testConsumerTopic0.1";
+ File avroFile1 = new File(path1);
+ Assertions.assertFalse(avroFile1.exists()); // Partition 0 avro-file shouldn't exist because there are no records left in the buffer.
+
+ List filenameList = new ArrayList<>();
+ for (int i = 1; i <= 9; i++) {
+ filenameList.add("testConsumerTopic" + i + "." + 1);
+ }
+ for (String fileName : filenameList) {
+ String path2 = config.queueDirectory() + "/" + fileName;
+ File avroFile = new File(path2);
+ Assertions.assertTrue(filenameList.contains(avroFile.getName()));
+ DatumReader datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+ DataFileReader reader = new DataFileReader<>(avroFile, datumReader);
+ for (int i = 0; i <= 13; i++) {
+ Assertions.assertTrue(reader.hasNext());
+ SyslogRecord record = reader.next();
+ Assertions.assertEquals(i, record.getOffset());
+ }
+ Assertions.assertFalse(reader.hasNext());
+ reader.close();
+ avroFile.delete();
+ }
+
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java
index 7c8e7db8..f0a3302d 100644
--- a/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/Ingestion2NewFilesTest.java
@@ -46,27 +46,23 @@
package com.teragrep.cfe_39;
import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
-import org.apache.avro.file.DataFileStream;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
-import java.net.URI;
import java.nio.file.Files;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
+import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -77,24 +73,71 @@ public class Ingestion2NewFilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion2NewFilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map);
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "30000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+ Map kafkaMap = new HashMap<>();
+ kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+ kafkaMap.put("bootstrap.servers", "test");
+ kafkaMap.put("auto.offset.reset", "earliest");
+ kafkaMap.put("enable.auto.commit", "false");
+ kafkaMap.put("group.id", "cfe_39");
+ kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+ kafkaMap.put("sasl.mechanism", "PLAIN");
+ kafkaMap.put("max.poll.records", "500");
+ kafkaMap.put("fetch.max.bytes", "1073741820");
+ kafkaMap.put("request.timeout.ms", "300000");
+ kafkaMap.put("max.poll.interval.ms", "300000");
+ kafkaMap.put("useMockKafkaConsumer", "true");
+ kafkaMap.put("numOfConsumers", "2");
+ kafkaConfig = new KafkaConfiguration(kafkaMap);
// Inserts pre-made avro-files with new timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -140,457 +183,52 @@ public void teardownMiniCluster() {
@Test
public void ingestion2NewFilesTest() {
/* This test case is for testing the functionality of the ingestion when there are files already present in the database before starting ingestion.
- 14 records are inserted to HDFS database before starting ingestion, with 124/140 records in mock kafka consumer ready for ingestion.
- Partitions through 1 to 9 will have only a single file, partition 0 will have 2 files (0.9 and 0.13) that are inserted to the database before starting ingestion.
+ 14 records are inserted to HDFS database before starting ingestion, with 126/160 records in mock kafka consumer ready for ingestion (20 broken records + 14 records already in HDFS).
+ Partitions 1 through 9 will each have only a single temporary avro-file that isn't stored to HDFS (size too small); partition 0 will have 2 files (0.9 and 0.13) that are inserted to the database before starting ingestion.
*/
assertDoesNotThrow(() -> {
// Assert the known starting state.
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
- Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
- config.setMaximumFileSize(30000);
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
- Thread.sleep(10000);
- hdfsDataIngestion.run();
-
- // Assert that the kafka records were ingested correctly and the database holds the expected 11 files holding the expected 140 records.
- Assertions
- .assertEquals(11, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13")));
- });
-
- // Check that the files were properly written to HDFS.
- String hdfsuri = config.getHdfsuri();
-
- String path = config.getHdfsPath() + "/" + "testConsumerTopic";
- // ====== Init HDFS File System Object
- Configuration conf = new Configuration();
- // Set FileSystem URI
- conf.set("fs.defaultFS", hdfsuri);
- // Because of Maven
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
- // Set HADOOP user
- System.setProperty("HADOOP_USER_NAME", "hdfs");
- System.setProperty("hadoop.home.dir", "/");
- //Get the filesystem - HDFS
- assertDoesNotThrow(() -> {
- fs = FileSystem.get(URI.create(hdfsuri), conf);
-
- Path workingDir = fs.getWorkingDirectory();
- Path newDirectoryPath = new Path(path);
- Assertions.assertTrue(fs.exists(newDirectoryPath));
-
- /* This is the HDFS write path for the files:
- Path hdfswritepath = new Path(newDirectoryPath + "/" + fileName); where newDirectoryPath is config.getHdfsPath() + "/" + lastObject.topic; and filename is lastObject.partition+"."+lastObject.offset;
-
- Create the list of files to read from HDFS. Test setup is created so each of the 1-9 partitions will have 1 file with offset of 13, while the 0th partition will have 2 files with offset 9 and 13.*/
- List filenameList = new ArrayList<>();
- filenameList.add("0.9");
- filenameList.add("0.13");
- for (int i = 1; i <= 9; i++) {
- filenameList.add(i + "." + 13);
- }
- FileStatus[] fileStatuses = fs.listStatus(newDirectoryPath);
- Assertions.assertEquals(filenameList.size(), fileStatuses.length);
- for (FileStatus fileStatus : fileStatuses) {
- Assertions.assertTrue(filenameList.contains(fileStatus.getPath().getName()));
- }
- LOGGER.info("All expected files present in HDFS.");
-
- int partitionCounter = 0;
-
- // Assertions for file testConsumerTopic/0.9
- String fileName0 = filenameList.get(0);
- Assertions.assertEquals("0.9", fileName0);
- // Assert that file testConsumerTopic/0.9 has expected content.
- LOGGER.info("Read file into hdfs");
- //Create a path
- Path hdfsreadpath = new Path(newDirectoryPath + "/" + fileName0); // The path should be the same that was used in writing the file to HDFS.
- //Init input stream
- FSDataInputStream inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- DataFileStream reader = new DataFileStream<>(
- inputStream,
- new SpecificDatumReader<>(SyslogRecord.class)
- );
- SyslogRecord record = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 0, \"origin\": \"jla-02.default\", \"payload\": \"[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090806000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 1, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 2, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 3, \"origin\": \"jla-02\", \"payload\": \"470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 4, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092238000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 5, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 6, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 7, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 8, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 9, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertFalse(reader.hasNext()); // Reached the end of the testConsumerTopic/0.9 file.
- inputStream.close();
- filenameList.remove(0);
-
- // Assertions for file testConsumerTopic/0.13
- fileName0 = filenameList.get(0);
- Assertions.assertEquals("0.13", fileName0);
- LOGGER.info("Read file into hdfs");
- //Create a path
- hdfsreadpath = new Path(newDirectoryPath + "/" + fileName0); // The path should be the same that was used in writing the file to HDFS.
- //Init input stream
- inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- reader = new DataFileStream<>(inputStream, new SpecificDatumReader<>(SyslogRecord.class));
- record = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+ HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
+ hdfsDataIngestion.run();
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
+ // Assert that the kafka records were ingested correctly and the database holds the expected 2 files.
Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 10, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 11, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]\"}",
- record.toString()
- );
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
- }
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092242000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 12, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]\"}",
- record.toString()
- );
+ // Assert the avro-files that were too small to be stored in HDFS.
+ String path1 = config.queueDirectory() + "/" + "testConsumerTopic0.1";
+ File avroFile1 = new File(path1);
+ Assertions.assertFalse(avroFile1.exists()); // Partition 0 avro-file shouldn't exist because there are no records left in the buffer.
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug(record.toString());
+ List filenameList = new ArrayList<>();
+ for (int partition = 1; partition <= 9; partition++) {
+ filenameList.add("testConsumerTopic" + partition + "." + 1);
}
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092243000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 13, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]\"}",
- record.toString()
- );
- Assertions.assertFalse(reader.hasNext()); // Reached the end of the testConsumerTopic/0.13 file.
- inputStream.close();
- filenameList.remove(0);
-
- partitionCounter++;
-
for (String fileName : filenameList) {
- //==== Read files
- LOGGER.info("Read file into hdfs");
- //Create a path
- hdfsreadpath = new Path(newDirectoryPath + "/" + fileName); // The path should be the same that was used in writing the file to HDFS.
- //Init input stream
- inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- reader = new DataFileStream<>(inputStream, new SpecificDatumReader<>(SyslogRecord.class));
- record = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090804000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 0, \"origin\": \"jla-02.default\", \"payload\": \"[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090806000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 1, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 2, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 3, \"origin\": \"jla-02\", \"payload\": \"470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090822000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 4, \"origin\": \"jla-02\", \"payload\": \"470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092238000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 5, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 6, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092239000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 7, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 8, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092240000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 9, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 10, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092241000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 11, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092242000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 12, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]\"}",
- record.toString()
- );
-
- Assertions.assertTrue(reader.hasNext());
- record = reader.next(record);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872092243000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \""
- + partitionCounter
- + "\", \"offset\": 13, \"origin\": \"jla-02.default\", \"payload\": \"25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]\"}",
- record.toString()
- );
+ String path2 = config.queueDirectory() + "/" + fileName;
+ File avroFile = new File(path2);
+ Assertions.assertTrue(filenameList.contains(avroFile.getName()));
+ DatumReader datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+ DataFileReader reader = new DataFileReader<>(avroFile, datumReader);
+ for (int offset = 0; offset <= 13; offset++) {
+ Assertions.assertTrue(reader.hasNext());
+ SyslogRecord record = reader.next();
+ Assertions.assertEquals(offset, record.getOffset());
+ }
Assertions.assertFalse(reader.hasNext());
- LOGGER.info("Partition {} passed assertions.", partitionCounter);
- partitionCounter++;
- inputStream.close();
+ reader.close();
+ avroFile.delete();
}
- Assertions.assertEquals(10, partitionCounter);
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java b/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java
index 4424918b..73b68fd0 100644
--- a/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/Ingestion2OldFilesTest.java
@@ -45,23 +45,26 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.nio.file.Files;
-import java.util.Objects;
-import java.util.Set;
+import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -72,24 +75,71 @@ public class Ingestion2OldFilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(Ingestion2OldFilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map);
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "30000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+ Map kafkaMap = new HashMap<>();
+ kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+ kafkaMap.put("bootstrap.servers", "test");
+ kafkaMap.put("auto.offset.reset", "earliest");
+ kafkaMap.put("enable.auto.commit", "false");
+ kafkaMap.put("group.id", "cfe_39");
+ kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+ kafkaMap.put("sasl.mechanism", "PLAIN");
+ kafkaMap.put("max.poll.records", "500");
+ kafkaMap.put("fetch.max.bytes", "1073741820");
+ kafkaMap.put("request.timeout.ms", "300000");
+ kafkaMap.put("max.poll.interval.ms", "300000");
+ kafkaMap.put("useMockKafkaConsumer", "true");
+ kafkaMap.put("numOfConsumers", "2");
+ kafkaConfig = new KafkaConfiguration(kafkaMap);
// Inserts pre-made avro-files with old timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -144,33 +194,45 @@ public void ingestion2OldFilesTest() {
assertDoesNotThrow(() -> {
// Assert the known starting state.
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct.
- Assertions.assertTrue((System.currentTimeMillis() - config.getPruneOffset()) > 157784760000L);
- config.setMaximumFileSize(30000);
- HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config);
- Thread.sleep(10000);
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+ Assertions.assertTrue((System.currentTimeMillis() - hdfsConfig.pruneOffset()) > 157784760000L);
+ HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
hdfsDataIngestion.run();
- // Assert that the kafka records were ingested and pruned correctly and the database holds only the expected 9 files.
- Assertions
- .assertEquals(9, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ // Assert that the kafka records were ingested and pruned correctly and the database doesn't hold any files.
Assertions
- .assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "1.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "2.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "3.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "4.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "5.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "6.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "7.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "8.13")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "9.13")));
+ .assertEquals(0, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+
+ // Assert the avro-files that were too small to be stored in HDFS.
+ String path1 = config.queueDirectory() + "/" + "testConsumerTopic0.1";
+ File avroFile1 = new File(path1);
+ Assertions.assertFalse(avroFile1.exists()); // Partition 0 avro-file shouldn't exist because there are no records left in the buffer.
+
+ List filenameList = new ArrayList<>();
+ for (int i = 1; i <= 9; i++) {
+ filenameList.add("testConsumerTopic" + i + "." + 1);
+ }
+ for (String fileName : filenameList) {
+ String path2 = config.queueDirectory() + "/" + fileName;
+ File avroFile = new File(path2);
+ Assertions.assertTrue(filenameList.contains(avroFile.getName()));
+ DatumReader datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+ DataFileReader reader = new DataFileReader<>(avroFile, datumReader);
+ for (int i = 0; i <= 13; i++) {
+ Assertions.assertTrue(reader.hasNext());
+ SyslogRecord record = reader.next();
+ Assertions.assertEquals(i, record.getOffset());
+ }
+ Assertions.assertFalse(reader.hasNext());
+ reader.close();
+ avroFile.delete();
+ }
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/IngestionConsumerTimeoutTest.java b/src/test/java/com/teragrep/cfe_39/IngestionConsumerTimeoutTest.java
new file mode 100644
index 00000000..a72e7ca1
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/IngestionConsumerTimeoutTest.java
@@ -0,0 +1,221 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.HdfsDataIngestion;
+import org.apache.avro.file.DataFileStream;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+public class IngestionConsumerTimeoutTest {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(IngestionConsumerTimeoutTest.class);
+ private static MiniDFSCluster hdfsCluster;
+ private static File baseDir;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
+ private FileSystem fs;
+
+    // Prepares known state for testing: creates the local queue directory, builds the common,
+    // HDFS and Kafka configurations from in-memory maps and starts a HDFS miniCluster.
+    @BeforeEach
+    public void startMiniCluster() {
+        assertDoesNotThrow(() -> {
+            // createDirectories does nothing when the directory already exists and throws when it
+            // cannot be created, so a broken queue directory fails the setup instead of being ignored.
+            final File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+            Files.createDirectories(queueDir.toPath());
+
+            final Map<String, String> map = new HashMap<>();
+            map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+            map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+            map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+            map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+            map.put("queueTopicPattern", "^testConsumerTopic-*$");
+            map.put("skipNonRFC5424Records", "true");
+            map.put("skipEmptyRFC5424Records", "true");
+            map.put("pruneOffset", "157784760000");
+            map.put("consumerTimeout", "1000"); // Low consumerTimeout
+            config = new CommonConfiguration(map);
+
+            // Create a HDFS miniCluster
+            baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+            hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+
+            // The hdfsuri is built from the name node port of the cluster that was just started.
+            final Map<String, String> hdfsMap = new HashMap<>();
+            hdfsMap.put("pruneOffset", "157784760000");
+            hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+            hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+            hdfsMap.put("java.security.krb5.kdc", "test");
+            hdfsMap.put("java.security.krb5.realm", "test");
+            hdfsMap.put("hadoop.security.authentication", "false");
+            hdfsMap.put("hadoop.security.authorization", "test");
+            hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+            hdfsMap.put("KerberosKeytabUser", "test");
+            hdfsMap.put("KerberosKeytabPath", "test");
+            hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+            hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+            hdfsMap.put("dfs.data.transfer.protection", "test");
+            hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+            hdfsMap.put("maximumFileSize", "3000000");
+            hdfsConfig = new HdfsConfiguration(hdfsMap);
+            fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
+
+            final Map<String, String> kafkaMap = new HashMap<>();
+            kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+            kafkaMap.put("bootstrap.servers", "test");
+            kafkaMap.put("auto.offset.reset", "earliest");
+            kafkaMap.put("enable.auto.commit", "false");
+            kafkaMap.put("group.id", "cfe_39");
+            kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+            kafkaMap.put("sasl.mechanism", "PLAIN");
+            kafkaMap.put("max.poll.records", "500");
+            kafkaMap.put("fetch.max.bytes", "1073741820");
+            kafkaMap.put("request.timeout.ms", "300000");
+            kafkaMap.put("max.poll.interval.ms", "300000");
+            kafkaMap.put("useMockKafkaConsumer", "true");
+            kafkaMap.put("numOfConsumers", "2");
+            kafkaConfig = new KafkaConfiguration(kafkaMap);
+        });
+    }
+
+    // Teardown the minicluster. Shutting the cluster down and deleting its base directory happen in
+    // a finally-block so that they still run when closing the FileSystem fails the assertion.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            assertDoesNotThrow(() -> fs.close());
+        }
+        finally {
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
+    @DisabledIfSystemProperty(
+            named = "skipIngestionTest",
+            matches = "true"
+    )
+    @Test
+    public void ingestion0FilesTest() {
+        // This test case is for testing the functionality of the consumerTimeout: the ingestion is run
+        // with a low consumerTimeout (1000) and a maximumFileSize (3000000) that is higher than the
+        // expected file sizes.
+        assertDoesNotThrow(() -> {
+            Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct.
+            Assertions.assertEquals(1000, config.consumerTimeout());
+            Assertions.assertEquals(3000000, hdfsConfig.maximumFileSize());
+            Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
+            HdfsDataIngestion hdfsDataIngestion = new HdfsDataIngestion(config, hdfsConfig, kafkaConfig);
+            hdfsDataIngestion.run();
+        });
+
+        // Assert that the kafka records were ingested correctly, HDFS should hold all the records even though maximumFileSize is set higher than expected file sizes.
+        assertDoesNotThrow(() -> {
+            final String topicPath = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic";
+            final Path topicDirectory = new Path(topicPath);
+            Assertions.assertTrue(fs.exists(topicDirectory));
+            Assertions.assertEquals(10, fs.listStatus(topicDirectory).length);
+
+            // One file per partition (0..9) is expected, each named "<partition>.13" and holding the
+            // records with offsets 0..13 in order.
+            for (int partition = 0; partition <= 9; partition++) {
+                final Path hdfsReadPath = new Path(topicPath + "/" + partition + ".13");
+                Assertions.assertTrue(fs.exists(hdfsReadPath));
+                // The data is in AVRO-format, so it can't be read as a string.
+                // try-with-resources closes both the AVRO reader and the HDFS input stream, also when an assertion fails.
+                try (
+                        FSDataInputStream inputStream = fs.open(hdfsReadPath);
+                        DataFileStream<SyslogRecord> reader = new DataFileStream<>(
+                                inputStream,
+                                new SpecificDatumReader<>(SyslogRecord.class)
+                        )
+                ) {
+                    for (int offset = 0; offset <= 13; offset++) {
+                        Assertions.assertTrue(reader.hasNext());
+                        SyslogRecord syslogRecord = reader.next();
+                        Assertions.assertEquals(offset, syslogRecord.getOffset());
+                    }
+                    Assertions.assertFalse(reader.hasNext());
+                }
+            }
+
+            // Assert that all the temporary AVRO-files generated by PartitionFile objects during consumption were deleted to prepare for new records.
+            // listFiles() returns null when the path is not a readable directory, so that case is asserted explicitly.
+            final File[] files = new File(config.queueDirectory()).listFiles();
+            Assertions.assertNotNull(files, "queueDirectory could not be listed");
+            Assertions.assertEquals(0, files.length);
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/KafkaConfigurationTest.java b/src/test/java/com/teragrep/cfe_39/KafkaConfigurationTest.java
new file mode 100644
index 00000000..6a997799
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/KafkaConfigurationTest.java
@@ -0,0 +1,95 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://github.com/teragrep/teragrep/blob/main/LICENSE>.
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
+import com.teragrep.cnf_01.PathConfiguration;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+public class KafkaConfigurationTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(KafkaConfigurationTest.class);
+
+    // Loads src/test/resources/valid.kafka.properties through PathConfiguration, checks the resulting
+    // map and then checks that KafkaConfiguration returns the same values through its accessors.
+    @Test
+    public void configurationTest() {
+        assertDoesNotThrow(() -> {
+            final PathConfiguration kafkaPathConfiguration = new PathConfiguration(
+                    System.getProperty("user.dir") + "/src/test/resources/valid.kafka.properties"
+            );
+            final Map<String, String> kafkaMap = kafkaPathConfiguration.asMap();
+
+            // The maps are compared with equals() instead of toString(), so the assertion does not
+            // depend on the iteration order of the map implementation.
+            final Map<String, String> expectedMap = new HashMap<>();
+            expectedMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+            expectedMap.put("numOfConsumers", "2");
+            expectedMap.put("useMockKafkaConsumer", "true");
+            expectedMap.put("max.poll.records", "500");
+            expectedMap.put("request.timeout.ms", "300000");
+            expectedMap.put("group.id", "cfe_39");
+            expectedMap.put("bootstrap.servers", "test");
+            expectedMap.put("security.protocol", "SASL_PLAINTEXT");
+            expectedMap.put("enable.auto.commit", "false");
+            expectedMap.put("sasl.mechanism", "PLAIN");
+            expectedMap.put("fetch.max.bytes", "1073741820");
+            expectedMap.put("max.poll.interval.ms", "300000");
+            expectedMap.put("auto.offset.reset", "earliest");
+            Assertions.assertEquals(expectedMap, kafkaMap);
+
+            final KafkaConfiguration kafkaConfig = new KafkaConfiguration(kafkaMap);
+
+            // Assert that printers return correct values.
+            Assertions.assertEquals("/opt/teragrep/cfe_39/etc/config.jaas", kafkaConfig.javaSecurityAuthLoginConfig());
+            Assertions.assertEquals("test", kafkaConfig.bootstrapServers());
+            Assertions.assertEquals("earliest", kafkaConfig.autoOffsetReset());
+            Assertions.assertEquals("false", kafkaConfig.enableAutoCommit());
+            Assertions.assertEquals("cfe_39", kafkaConfig.groupId());
+            Assertions.assertEquals("SASL_PLAINTEXT", kafkaConfig.securityProtocol());
+            Assertions.assertEquals("PLAIN", kafkaConfig.saslMechanism());
+            Assertions.assertEquals(500, kafkaConfig.maxPollRecords());
+            Assertions.assertEquals(1073741820, kafkaConfig.fetchMaxBytes());
+            Assertions.assertEquals(300000, kafkaConfig.requestTimeoutMs());
+            Assertions.assertEquals(300000, kafkaConfig.maxPollIntervalMs());
+            Assertions.assertTrue(kafkaConfig.useMockKafkaConsumer());
+            Assertions.assertEquals(2, kafkaConfig.numOfConsumers());
+        });
+    }
+}
diff --git a/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java b/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java
index 0087786a..fb257e69 100644
--- a/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java
+++ b/src/test/java/com/teragrep/cfe_39/KafkaConsumerTest.java
@@ -45,19 +45,28 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.configuration.KafkaConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.BatchDistributionImpl;
import com.teragrep.cfe_39.consumers.kafka.ReadCoordinator;
-import com.teragrep.cfe_39.consumers.kafka.RecordOffset;
-import com.teragrep.rlo_06.ParseException;
-import com.teragrep.rlo_06.RFC5424Frame;
+import com.teragrep.cfe_39.metrics.DurationStatistics;
+import com.teragrep.cfe_39.metrics.topic.TopicCounter;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.kafka.common.TopicPartition;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.file.Files;
import java.util.*;
-import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -65,586 +74,207 @@ public class KafkaConsumerTest {
private static final Logger LOGGER = LoggerFactory.getLogger(KafkaConsumerTest.class);
+ private static MiniDFSCluster hdfsCluster;
+ private static File baseDir;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
+ private static KafkaConfiguration kafkaConfig;
+ private FileSystem fs;
+
+ // Prepares known state for testing: runs before each test and rebuilds the local AVRO queue directory, a fresh HDFS minicluster and the common/HDFS/Kafka configuration objects.
+ @BeforeEach
+ public void startMiniCluster() {
+ assertDoesNotThrow(() -> {
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO"); // same directory that is configured as queueDirectory below
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>(); // settings handed to CommonConfiguration
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/"); // the tests read the produced AVRO files back from config.queueDirectory()
+ map.put("queueTopicPattern", "^testConsumerTopic-*$"); // NOTE(review): "-*" matches zero or more hyphens only; confirm this is the intended pattern
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map); // static field, replaced before every test
+
+ // Create an HDFS minicluster in a new temporary base directory; both are removed again in teardownMiniCluster().
+ baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>(); // settings handed to HdfsConfiguration
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/"); // points at the NameNode port of the minicluster created above
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test"); // "test" values in this map look like placeholders - TODO confirm they are never resolved
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "30000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri()); // per-test FileSystem handle, closed in teardownMiniCluster()
+
+ Map kafkaMap = new HashMap<>(); // settings handed to KafkaConfiguration
+ kafkaMap.put("java.security.auth.login.config", "/opt/teragrep/cfe_39/etc/config.jaas");
+ kafkaMap.put("bootstrap.servers", "test");
+ kafkaMap.put("auto.offset.reset", "earliest");
+ kafkaMap.put("enable.auto.commit", "false");
+ kafkaMap.put("group.id", "cfe_39");
+ kafkaMap.put("security.protocol", "SASL_PLAINTEXT");
+ kafkaMap.put("sasl.mechanism", "PLAIN");
+ kafkaMap.put("max.poll.records", "500");
+ kafkaMap.put("fetch.max.bytes", "1073741820");
+ kafkaMap.put("request.timeout.ms", "300000");
+ kafkaMap.put("max.poll.interval.ms", "300000");
+ kafkaMap.put("useMockKafkaConsumer", "true"); // presumably makes the consumers use mock data instead of a broker - confirm against ReadCoordinator
+ kafkaMap.put("numOfConsumers", "2");
+ kafkaConfig = new KafkaConfiguration(kafkaMap);
+ });
+ }
+
+    // Tears down the minicluster after each test. The cluster shutdown and the removal of its
+    // base directory run in a finally block, so a failing FileSystem close still fails the test
+    // but can no longer leave a running cluster and its temporary directory behind.
+    @AfterEach
+    public void teardownMiniCluster() {
+        try {
+            // Closing the per-test FileSystem handle must not throw.
+            assertDoesNotThrow(() -> fs.close());
+        } finally {
+            hdfsCluster.shutdown();
+            FileUtil.fullyDelete(baseDir);
+        }
+    }
+
@Test
public void readCoordinatorTest2Threads() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- Config config = new Config();
Map hdfsStartOffsets = new HashMap<>();
- ArrayList> messages = new ArrayList<>();
- Consumer> output = message -> messages.add(message);
+ DurationStatistics durationStatistics = new DurationStatistics();
+ durationStatistics.register();
+ // BatchDistributionImpl cannot be used as a functional interface, so concrete instances are constructed here.
+ BatchDistributionImpl output1 = new BatchDistributionImpl(
+ config, // Configuration settings
+ hdfsConfig,
+ "topicName", // String, the name of the topic
+ durationStatistics, // RuntimeStatistics object from metrics
+ new TopicCounter("topicName") // TopicCounter object from metrics
+ );
+ BatchDistributionImpl output2 = new BatchDistributionImpl(
+ config, // Configuration settings
+ hdfsConfig,
+ "topicName", // String, the name of the topic
+ durationStatistics, // RuntimeStatistics object from metrics
+ new TopicCounter("topicName") // TopicCounter object from metrics
+ );
ReadCoordinator readCoordinator = new ReadCoordinator(
"testConsumerTopic",
- config.getKafkaConsumerProperties(),
- output,
+ config,
+ kafkaConfig,
+ hdfsConfig,
+ output1,
hdfsStartOffsets
);
Thread readThread = new Thread(null, readCoordinator, "testConsumerTopic1"); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic.
readThread.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator.
- Thread.sleep(1000);
-
ReadCoordinator readCoordinator2 = new ReadCoordinator(
"testConsumerTopic",
- config.getKafkaConsumerProperties(),
- output,
+ config,
+ kafkaConfig,
+ hdfsConfig,
+ output2,
hdfsStartOffsets
);
Thread readThread2 = new Thread(null, readCoordinator2, "testConsumerTopic2"); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic.
readThread2.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator.
- Thread.sleep(10000);
- Assertions.assertEquals(2, messages.size());
- Assertions.assertEquals(160, messages.get(0).size() + messages.get(1).size()); // Assert that expected amount of records has been consumed by the consumer group.
- Assertions.assertEquals(80, messages.get(0).size());
- Assertions.assertEquals(80, messages.get(1).size());
-
- // Assert that all the record contents are correct, every topic partition has identical set of offset-message pairings.
- List messageList = new ArrayList();
- messageList.add("[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!");
- messageList.add("[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!");
- messageList.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.");
- messageList.add("470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.");
- messageList.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.");
- messageList
- .add(
- "25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]"
- );
- messageList
- .add(
- "25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]"
- );
-
- RFC5424Frame rfc5424Frame = new RFC5424Frame(false);
-
- RecordOffset recordOffset;
-
- Iterator iterator = messageList.iterator();
- int counter = 0;
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":7, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- ParseException e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":5, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":3, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":1, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":9, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- Assertions.assertEquals(80, counter);
-
- counter = 0;
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":8, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":6, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
- }
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
+ Thread.sleep(10000); // Allow read threads to have enough time to execute their tasks properly.
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":4, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
+ // Because BatchDistributionImpl cannot be used as a functional interface, the assertions are made against the written AVRO files until a better solution is found (e.g. adding a fake implementation of the interface).
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
+ // Assert the records inside the avro-files
+ List filenameList = new ArrayList<>();
+ for (int i = 0; i <= 9; i++) {
+ filenameList.add("testConsumerTopic" + i + "." + 1);
}
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":2, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- iterator = messageList.iterator();
- for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + i + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
+            // Each expected per-partition AVRO file must exist and contain exactly the records with offsets 0..13, in order.
+            for (String fileName : filenameList) {
+                String path2 = config.queueDirectory() + "/" + fileName;
+                File avroFile = new File(path2);
+                // Check the file on disk; comparing its name against filenameList would always be true.
+                Assertions.assertTrue(avroFile.exists(), "Missing AVRO file: " + avroFile);
+                DatumReader<SyslogRecord> datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+                // try-with-resources closes the reader even when one of the assertions below fails.
+                try (DataFileReader<SyslogRecord> reader = new DataFileReader<>(avroFile, datumReader)) {
+                    for (int i = 0; i <= 13; i++) {
+                        Assertions.assertTrue(reader.hasNext());
+                        SyslogRecord record = reader.next();
+                        Assertions.assertEquals(i, record.getOffset());
+                    }
+                    Assertions.assertFalse(reader.hasNext());
+                }
+                avroFile.delete();
}
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + 14 + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(1).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":0, \"offset\":" + 15 + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
-
- Assertions.assertEquals(80, counter);
-
});
}
@Test
public void readCoordinatorTest1Thread() {
+
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- Config config = new Config();
Map hdfsStartOffsets = new HashMap<>();
- ArrayList> messages = new ArrayList<>();
- Consumer> output = message -> messages.add(message);
+ DurationStatistics durationStatistics = new DurationStatistics();
+ durationStatistics.register();
+ // BatchDistributionImpl cannot be used as a functional interface, so a concrete instance is constructed here.
+ BatchDistributionImpl output = new BatchDistributionImpl(
+ config, // Configuration settings
+ hdfsConfig,
+ "topicName", // String, the name of the topic
+ durationStatistics, // RuntimeStatistics object from metrics
+ new TopicCounter("topicName") // TopicCounter object from metrics
+ );
ReadCoordinator readCoordinator = new ReadCoordinator(
"testConsumerTopic",
- config.getKafkaConsumerProperties(),
+ config,
+ kafkaConfig,
+ hdfsConfig,
output,
hdfsStartOffsets
);
Thread readThread = new Thread(null, readCoordinator, "testConsumerTopic0"); // Starts the thread with readCoordinator that creates the consumer and subscribes to the topic.
readThread.start(); // Starts the thread, in other words proceeds to call run() function of ReadCoordinator.
- Thread.sleep(10000);
- Assertions.assertEquals(1, messages.size());
- Assertions.assertEquals(160, messages.get(0).size()); // Assert that expected amount of records has been consumed by the consumer.
+ Thread.sleep(10000); // Allow read thread to have enough time to execute the task properly.
- // Assert that all the record contents are correct, every topic partition has identical set of offset-message pairings.
- List list = new ArrayList();
- list.add("[WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!");
- list.add("[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!");
- list.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi.");
- list.add("470646 [Thread-3] INFO com.teragrep.jla_02.Logback Audit - Logback-audit says hi.");
- list.add("470647 [Thread-3] INFO com.teragrep.jla_02.Logback Metric - Logback-metric says hi.");
- list
- .add(
- "25.04.2022 07:34:52.238 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info audit says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info daily says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.239 [INFO] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 info metric says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn audit says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.240 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn daily says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.241 [WARN] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 warn metric says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.241 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error audit says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.242 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error daily says hi!]"
- );
- list
- .add(
- "25.04.2022 07:34:52.243 [ERROR] com.teragrep.jla_02.Log4j2 [instanceId=01, thread=Thread-0, userId=, sessionId=, requestId=, SUBJECT=, VERB=, OBJECT=, OUTCOME=, message=Log4j2 error metric says hi!]"
- );
+ // Because BatchDistributionImpl cannot be used as a functional interface, the assertions are made against the written AVRO files until a better solution is found (e.g. adding a fake implementation of the interface).
- RFC5424Frame rfc5424Frame = new RFC5424Frame(false);
- RecordOffset recordOffset;
- Iterator iterator;
- List partitionList = new ArrayList();
- partitionList.add(7);
- partitionList.add(8);
- partitionList.add(5);
- partitionList.add(6);
- partitionList.add(3);
- partitionList.add(4);
- partitionList.add(1);
- partitionList.add(2);
- partitionList.add(0);
- partitionList.add(9);
- int counter = 0;
- for (int partition : partitionList) {
- iterator = list.iterator();
+ // Assert the records inside the avro-files
+ List filenameList = new ArrayList<>();
+ for (int i = 0; i <= 9; i++) {
+ filenameList.add("testConsumerTopic" + i + "." + 1);
+ }
+            // Each expected per-partition AVRO file must exist and contain exactly the records with offsets 0..13, in order.
+            for (String fileName : filenameList) {
+                String path2 = config.queueDirectory() + "/" + fileName;
+                File avroFile = new File(path2);
+                // Check the file on disk; comparing its name against filenameList would always be true.
+                Assertions.assertTrue(avroFile.exists(), "Missing AVRO file: " + avroFile);
+                DatumReader<SyslogRecord> datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+                // try-with-resources closes the reader even when one of the assertions below fails.
+                try (DataFileReader<SyslogRecord> reader = new DataFileReader<>(avroFile, datumReader)) {
for (int i = 0; i <= 13; i++) {
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + i
- + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- Assertions.assertTrue(rfc5424Frame.next());
- Assertions.assertTrue(iterator.hasNext());
- Assertions.assertEquals(iterator.next(), rfc5424Frame.msg.toString());
- Assertions.assertFalse(rfc5424Frame.next());
- counter++;
+                        Assertions.assertTrue(reader.hasNext());
+                        SyslogRecord record = reader.next();
+                        Assertions.assertEquals(i, record.getOffset());
}
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + 14
- + "}",
- recordOffset.offsetToJSON()
- );
- Assertions.assertNull(recordOffset.getRecord());
- counter++;
-
- recordOffset = messages.get(0).get(counter);
- Assertions
- .assertEquals(
- "{\"topic\":\"testConsumerTopic\", \"partition\":" + partition + ", \"offset\":" + 15
- + "}",
- recordOffset.offsetToJSON()
- );
- rfc5424Frame.load(new ByteArrayInputStream(recordOffset.getRecord()));
- ParseException e = Assertions.assertThrows(ParseException.class, rfc5424Frame::next);
- Assertions.assertEquals("PRIORITY < missing", e.getMessage());
- counter++;
+                    Assertions.assertFalse(reader.hasNext());
+                }
+                avroFile.delete();
}
- Assertions.assertEquals(160, counter); // All 160 records were asserted.
-
});
}
diff --git a/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java b/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java
index 63d83e8c..0c0bd10e 100644
--- a/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java
+++ b/src/test/java/com/teragrep/cfe_39/ProcessingFailureTest.java
@@ -45,8 +45,10 @@
*/
package com.teragrep.cfe_39;
-import com.teragrep.cfe_39.consumers.kafka.DatabaseOutput;
-import com.teragrep.cfe_39.consumers.kafka.RecordOffset;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.BatchDistributionImpl;
+import com.teragrep.cfe_39.consumers.kafka.KafkaRecordImpl;
import com.teragrep.cfe_39.metrics.DurationStatistics;
import com.teragrep.cfe_39.metrics.topic.TopicCounter;
import org.apache.hadoop.fs.FileSystem;
@@ -65,9 +67,10 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -78,21 +81,51 @@ public class ProcessingFailureTest {
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static CommonConfiguration config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration with skipping of broken records disabled.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/failProcessing.application.properties");
- config = new Config();
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO"); // same directory that is configured as queueDirectory below
+ if (!queueDir.exists()) {
+ queueDir.mkdirs();
+ }
+ Map map = new HashMap<>(); // settings handed to CommonConfiguration
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/");
+ map.put("queueTopicPattern", "^testConsumerTopic-*$"); // NOTE(review): "-*" matches zero or more hyphens only; confirm this is the intended pattern
+ map.put("skipNonRFC5424Records", "false"); // both skip flags are "false" here: skipping of broken records is disabled for this test class
+ map.put("skipEmptyRFC5424Records", "false");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map); // static field, replaced before every test
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>(); // settings handed to HdfsConfiguration
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/"); // points at the NameNode port of the minicluster created above
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test"); // "test" values in this map look like placeholders - TODO confirm they are never resolved
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri()); // FileSystem handle created from the URI configured above
});
}
@@ -117,8 +150,9 @@ public void failNonRFC5424DatabaseOutputTest() {
assertDoesNotThrow(() -> {
- Consumer> output = new DatabaseOutput(
+ BatchDistributionImpl output = new BatchDistributionImpl(
config, // Configuration settings
+ hdfsConfig,
"topicName", // String, the name of the topic
durationStatistics, // RuntimeStatistics object from metrics
new TopicCounter("topicName") // TopicCounter object from metrics
@@ -132,19 +166,24 @@ public void failNonRFC5424DatabaseOutputTest() {
"12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
.getBytes(StandardCharsets.UTF_8)
);
- RecordOffset recordOffsetObject = new RecordOffset(
+ KafkaRecordImpl recordOffsetObject = new KafkaRecordImpl(
record.topic(),
record.partition(),
record.offset(),
record.value()
);
- List recordOffsetObjectList = new ArrayList<>();
+ List recordOffsetObjectList = new ArrayList<>();
recordOffsetObjectList.add(recordOffsetObject);
Exception e = Assertions.assertThrows(Exception.class, () -> output.accept(recordOffsetObjectList));
Assertions.assertEquals("com.teragrep.rlo_06.PriorityParseException: PRIORITY < missing", e.getMessage());
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1")));
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.1")));
// No files stored to hdfs.
+
+ // Assert the local avro file that should be empty.
+ File queueDirectory = new File(config.queueDirectory());
+ File[] files = queueDirectory.listFiles();
+ Assertions.assertEquals(0, files.length); // Partition 0 avro-file shouldn't exist because there are no records left in the buffer.
});
}
@@ -160,8 +199,9 @@ public void failNullRFC5424DatabaseOutputTest() {
assertDoesNotThrow(() -> {
- Consumer> output = new DatabaseOutput(
+ BatchDistributionImpl output = new BatchDistributionImpl(
config, // Configuration settings
+ hdfsConfig,
"topicName", // String, the name of the topic
durationStatistics, // RuntimeStatistics object from metrics
new TopicCounter("topicName") // TopicCounter object from metrics
@@ -174,20 +214,29 @@ public void failNullRFC5424DatabaseOutputTest() {
"2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
null
);
- RecordOffset recordOffsetObject = new RecordOffset(
+ KafkaRecordImpl recordOffsetObject = new KafkaRecordImpl(
record.topic(),
record.partition(),
record.offset(),
record.value()
);
- List recordOffsetObjectList = new ArrayList<>();
+ List recordOffsetObjectList = new ArrayList<>();
recordOffsetObjectList.add(recordOffsetObject);
- NullPointerException e = Assertions
- .assertThrows(NullPointerException.class, () -> output.accept(recordOffsetObjectList));
- Assertions.assertEquals("Record with null content detected during processing.", e.getMessage());
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1")));
+ RuntimeException e = Assertions
+ .assertThrows(RuntimeException.class, () -> output.accept(recordOffsetObjectList));
+ Assertions
+ .assertEquals(
+ "java.lang.NullPointerException: Cannot read the array length because \"buf\" is null",
+ e.getMessage()
+ );
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "topicName" + "/" + "0.1")));
// No files stored to hdfs.
+
+ // Assert the local avro file that should be empty.
+ File queueDirectory = new File(config.queueDirectory());
+ File[] files = queueDirectory.listFiles();
+ Assertions.assertEquals(0, files.length); // Partition 0 avro-file shouldn't exist because there are no records left in the buffer.
});
}
diff --git a/src/test/java/com/teragrep/cfe_39/ProcessingTest.java b/src/test/java/com/teragrep/cfe_39/ProcessingTest.java
deleted file mode 100644
index 17820c7f..00000000
--- a/src/test/java/com/teragrep/cfe_39/ProcessingTest.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * HDFS Data Ingestion for PTH_06 use CFE-39
- * Copyright (C) 2021-2024 Suomen Kanuuna Oy
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- *
- * Additional permission under GNU Affero General Public License version 3
- * section 7
- *
- * If you modify this Program, or any covered work, by linking or combining it
- * with other code, such other code is not for that reason alone subject to any
- * of the requirements of the GNU Affero GPL version 3 as long as this Program
- * is the same Program as licensed from Suomen Kanuuna Oy without any additional
- * modifications.
- *
- * Supplemented terms under GNU Affero General Public License version 3
- * section 7
- *
- * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
- * versions must be marked as "Modified version of" The Program.
- *
- * Names of the licensors and authors may not be used for publicity purposes.
- *
- * No rights are granted for use of trade names, trademarks, or service marks
- * which are in The Program if any.
- *
- * Licensee must indemnify licensors and authors for any liability that these
- * contractual assumptions impose on licensors and authors.
- *
- * To the extent this program is licensed as part of the Commercial versions of
- * Teragrep, the applicable Commercial License may apply to this file if you as
- * a licensee so wish it.
- */
-package com.teragrep.cfe_39;
-
-import com.teragrep.cfe_39.avro.SyslogRecord;
-import com.teragrep.cfe_39.consumers.kafka.DatabaseOutput;
-import com.teragrep.cfe_39.consumers.kafka.RecordOffset;
-import com.teragrep.cfe_39.metrics.DurationStatistics;
-import com.teragrep.cfe_39.metrics.topic.TopicCounter;
-import org.apache.avro.file.DataFileStream;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.kafka.clients.consumer.ConsumerRecord;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.function.Consumer;
-
-import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
-
-// Tests for processing of consumed kafka records with skipping of broken records enabled (both null and non rfc5424).
-public class ProcessingTest {
-
- private static final Logger LOGGER = LoggerFactory.getLogger(ProcessingTest.class);
-
- private static MiniDFSCluster hdfsCluster;
- private static File baseDir;
- private static Config config;
- private FileSystem fs;
-
- // Prepares known state for testing.
- @BeforeEach
- public void startMiniCluster() {
- assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
- // Create a HDFS miniCluster
- baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
- });
- }
-
- // Teardown the minicluster
- @AfterEach
- public void teardownMiniCluster() {
- assertDoesNotThrow(() -> {
- fs.close();
- });
- hdfsCluster.shutdown();
- FileUtil.fullyDelete(baseDir);
- }
-
- @Test
- public void skipNonRFC5424DatabaseOutputTest() {
- // Initialize and register duration statistics
- DurationStatistics durationStatistics = new DurationStatistics();
- durationStatistics.register();
-
- // register per topic counting
- List topicCounters = new CopyOnWriteArrayList<>();
-
- assertDoesNotThrow(() -> {
-
- Consumer> output = new DatabaseOutput(
- config, // Configuration settings
- "topicName", // String, the name of the topic
- durationStatistics, // RuntimeStatistics object from metrics
- new TopicCounter("topicName") // TopicCounter object from metrics
- );
-
- ConsumerRecord record = new ConsumerRecord<>(
- "topicName",
- 0,
- 1L,
- "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
- "12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
- .getBytes(StandardCharsets.UTF_8)
- );
- RecordOffset recordOffsetObject = new RecordOffset(
- record.topic(),
- record.partition(),
- record.offset(),
- record.value()
- );
-
- List recordOffsetObjectList = new ArrayList<>();
- recordOffsetObjectList.add(recordOffsetObject);
- output.accept(recordOffsetObjectList);
- Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1")));
- // File in hdfs does not contain any records, but acts as a marker for kafka consumer offsets.
-
- // Assert that the file in hdfs contains the expected zero record.
-
- Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1");
- //Init input stream
- FSDataInputStream inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- DataFileStream reader = new DataFileStream<>(
- inputStream,
- new SpecificDatumReader<>(SyslogRecord.class)
- );
- SyslogRecord syslogRecord = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
-
- Assertions.assertFalse(reader.hasNext());
- });
-
- }
-
- @Test
- public void skipNullRFC5424DatabaseOutputTest() {
- // Initialize and register duration statistics
- DurationStatistics durationStatistics = new DurationStatistics();
- durationStatistics.register();
-
- // register per topic counting
- List topicCounters = new CopyOnWriteArrayList<>();
-
- assertDoesNotThrow(() -> {
-
- Consumer> output = new DatabaseOutput(
- config, // Configuration settings
- "topicName", // String, the name of the topic
- durationStatistics, // RuntimeStatistics object from metrics
- new TopicCounter("topicName") // TopicCounter object from metrics
- );
-
- ConsumerRecord record = new ConsumerRecord<>(
- "topicName",
- 0,
- 1L,
- "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
- null
- );
- RecordOffset recordOffsetObject = new RecordOffset(
- record.topic(),
- record.partition(),
- record.offset(),
- record.value()
- );
-
- List recordOffsetObjectList = new ArrayList<>();
- recordOffsetObjectList.add(recordOffsetObject);
- output.accept(recordOffsetObjectList);
- Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1")));
- // File in hdfs does not contain any records, but acts as a marker for kafka consumer offsets.
-
- // Assert that the file in hdfs contains the expected zero record.
-
- Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.1");
- //Init input stream
- FSDataInputStream inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- DataFileStream reader = new DataFileStream<>(
- inputStream,
- new SpecificDatumReader<>(SyslogRecord.class)
- );
- SyslogRecord syslogRecord = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
-
- Assertions.assertFalse(reader.hasNext());
- });
-
- }
-
- @Test
- public void skipNullAndNonRFC5424DatabaseOutputTest() {
- // Initialize and register duration statistics
- DurationStatistics durationStatistics = new DurationStatistics();
- durationStatistics.register();
-
- // register per topic counting
- List topicCounters = new CopyOnWriteArrayList<>();
-
- assertDoesNotThrow(() -> {
-
- Consumer> output = new DatabaseOutput(
- config, // Configuration settings
- "topicName", // String, the name of the topic
- durationStatistics, // RuntimeStatistics object from metrics
- new TopicCounter("topicName") // TopicCounter object from metrics
- );
-
- List recordOffsetObjectList = new ArrayList<>();
-
- ConsumerRecord record = new ConsumerRecord<>(
- "topicName",
- 0,
- 1L,
- "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
- null
- );
- RecordOffset recordOffsetObject = new RecordOffset(
- record.topic(),
- record.partition(),
- record.offset(),
- record.value()
- );
- recordOffsetObjectList.add(recordOffsetObject);
-
- record = new ConsumerRecord<>(
- "topicName",
- 0,
- 2L,
- "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
- "12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
- .getBytes(StandardCharsets.UTF_8)
- );
- recordOffsetObject = new RecordOffset(record.topic(), record.partition(), record.offset(), record.value());
- recordOffsetObjectList.add(recordOffsetObject);
- record = new ConsumerRecord<>(
- "topicName",
- 0,
- 3L,
- "2022-04-25T07:34:50.807Z".getBytes(StandardCharsets.UTF_8),
- "<12>1 2022-04-25T07:34:50.807Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
- .getBytes(StandardCharsets.UTF_8)
- );
- recordOffsetObject = new RecordOffset(record.topic(), record.partition(), record.offset(), record.value());
- recordOffsetObjectList.add(recordOffsetObject);
- output.accept(recordOffsetObjectList);
- Assertions.assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "topicName")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.3")));
-
- // Assert that the file in hdfs contains the expected single record.
-
- Path hdfsreadpath = new Path(config.getHdfsPath() + "/" + "topicName" + "/" + "0.3");
- //Init input stream
- FSDataInputStream inputStream = fs.open(hdfsreadpath);
- //The data is in AVRO-format, so it can't be read as a string.
- DataFileStream reader = new DataFileStream<>(
- inputStream,
- new SpecificDatumReader<>(SyslogRecord.class)
- );
- SyslogRecord syslogRecord = null;
- LOGGER.info("\nReading records from file {}:", hdfsreadpath);
-
- Assertions.assertTrue(reader.hasNext());
- syslogRecord = reader.next(syslogRecord);
- Assertions
- .assertEquals(
- "{\"timestamp\": 1650872090807000, \"directory\": \"jla02logger\", \"stream\": \"test:jla02logger:0\", \"host\": \"jla-02.default\", \"input\": \"imrelp:cfe-06-0.cfe-06.default:\", \"partition\": \"0\", \"offset\": 3, \"origin\": \"jla-02.default\", \"payload\": \"[ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!\"}",
- syslogRecord.toString()
- );
- Assertions.assertFalse(reader.hasNext());
-
- });
- }
-}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java
index f89603d3..c83e592c 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningNoFilesTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -67,21 +70,34 @@ public class PruningNoFilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningNoFilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Start minicluster and initialize config.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
});
}
@@ -98,15 +114,15 @@ public void teardownMiniCluster() {
public void noFiles() {
// This test case is for testing the functionality of the HDFSPrune.java when the target database is empty.
assertDoesNotThrow(() -> {
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
Assertions
- .assertEquals(0, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+ .assertEquals(0, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(0, deleted);
Assertions
- .assertEquals(0, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
+ .assertEquals(0, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java
index bcd06660..98cbd310 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneNewFileTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -71,24 +74,37 @@ public class PruningOneNewFileTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneNewFileTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
// Inserts a single pre-made avro-file with a new timestamp to HDFS, which is normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -129,21 +145,21 @@ public void teardownMiniCluster() {
@Test
public void oneNewFileTest() {
// This test case is for testing the functionality of the HDFSPrune.java when the database holds a file with a timestamp that shouldn't trigger pruning of old files.
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
- Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+ Assertions.assertTrue(System.currentTimeMillis() - hdfsConfig.pruneOffset() > 157784760000L);
assertDoesNotThrow(() -> {
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")));
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(0, deleted);
// Also check with HDFS access if expected files still exist.
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java
index 0e7445f3..256e0034 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneOldFileTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -71,24 +74,37 @@ public class PruningOneOldFileTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneOldFileTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
// Inserts a single pre-made avro-file with an olf timestamp to HDFS, which is normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -129,21 +145,22 @@ public void teardownMiniCluster() {
@Test
public void oneOldFileTest() {
// This test case is for testing the functionality of the HDFSPrune.java when the database holds a file with a timestamp that should trigger pruning of old files.
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
- Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+ Assertions.assertTrue(System.currentTimeMillis() - hdfsConfig.pruneOffset() > 157784760000L); // NOTE(review): presumably checks that the cutoff (now - offset) is later than the old fixture file's timestamp; confirm against the fixture setup.
assertDoesNotThrow(() -> {
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"))); // Precondition: the topic directory exists before pruning.
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(1, deleted);
// Also check with HDFS access if expected files still exist.
Assertions
- .assertEquals(0, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(0, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions
+ .assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); // File 0.9 must be gone after pruning.
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java b/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java
index 483e36dc..ce131d5d 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningOneOldOneNewFileTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -71,25 +74,39 @@ public class PruningOneOldOneNewFileTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningOneOldOneNewFileTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
+
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
/* Inserts pre-made avro-files to HDFS, which are normally generated during data ingestion from mock kafka consumer.
One file has new timestamp and another old timestamp.*/
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -133,23 +150,26 @@ public void teardownMiniCluster() {
public void oneOldOneNewFileTest() {
/* This test case is for testing the functionality of the HDFSPrune.java when the database holds a file with a timestamp that shouldn't trigger pruning of old files and another file that should trigger the pruning.
The file with newer timestamp is ignored while the older is deleted from the database.*/
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
- Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+ Assertions.assertTrue(System.currentTimeMillis() - hdfsConfig.pruneOffset() > 157784760000L); // NOTE(review): presumably checks that the cutoff (now - offset) is later than the old fixture file's timestamp; confirm against the fixture setup.
assertDoesNotThrow(() -> {
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"))); // Precondition: the topic directory exists before pruning.
+ Assertions
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(1, deleted);
// Also check with HDFS access if expected files still exist.
Assertions
- .assertEquals(1, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ .assertEquals(1, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions
+ .assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9"))); // File 0.9 is expected to have been pruned.
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13"))); // File 0.13 is expected to survive pruning.
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java
index 0f3b450c..f58ea55b 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningTwoNewFilesTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -71,25 +74,38 @@ public class PruningTwoNewFilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningTwoNewFilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
// Inserts pre-made avro-files with new timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -131,21 +147,23 @@ public void teardownMiniCluster() {
@Test
public void twoNewFilesTest() {
// This test case is for testing the functionality of the HDFSPrune.java when the database holds two files with a timestamp that shouldn't trigger pruning of old files.
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
assertDoesNotThrow(() -> {
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length); // Precondition: exactly two entries in the topic directory.
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(0, deleted);
// Also check with HDFS access if expected files still exist.
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length); // Nothing was pruned, so both entries remain.
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java b/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java
index 0a5ae764..7987fd37 100644
--- a/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java
+++ b/src/test/java/com/teragrep/cfe_39/PruningTwoOldFilesTest.java
@@ -45,6 +45,7 @@
*/
package com.teragrep.cfe_39;
+import com.teragrep.cfe_39.configuration.HdfsConfiguration;
import com.teragrep.cfe_39.consumers.kafka.HDFSPrune;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -59,6 +60,8 @@
import java.io.File;
import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -71,25 +74,38 @@ public class PruningTwoOldFilesTest {
private static final Logger LOGGER = LoggerFactory.getLogger(PruningTwoOldFilesTest.class);
private static MiniDFSCluster hdfsCluster;
private static File baseDir;
- private static Config config;
+ private static HdfsConfiguration hdfsConfig;
private FileSystem fs;
// Prepares known state for testing.
@BeforeEach
public void startMiniCluster() {
assertDoesNotThrow(() -> {
- // Set system properties to use the valid configuration.
- System
- .setProperty("cfe_39.config.location", System.getProperty("user.dir") + "/src/test/resources/valid.application.properties");
- config = new Config();
// Create a HDFS miniCluster
baseDir = Files.createTempDirectory("test_hdfs").toFile().getAbsoluteFile();
- hdfsCluster = new TestMiniClusterFactory().create(config, baseDir);
- fs = new TestFileSystemFactory().create(config.getHdfsuri());
+ hdfsCluster = new TestMiniClusterFactory().create(baseDir);
+ Map hdfsMap = new HashMap<>();
+ hdfsMap.put("pruneOffset", "157784760000");
+ hdfsMap.put("hdfsuri", "hdfs://localhost:" + hdfsCluster.getNameNodePort() + "/");
+ hdfsMap.put("hdfsPath", "hdfs:///opt/teragrep/cfe_39/srv/");
+ hdfsMap.put("java.security.krb5.kdc", "test");
+ hdfsMap.put("java.security.krb5.realm", "test");
+ hdfsMap.put("hadoop.security.authentication", "false");
+ hdfsMap.put("hadoop.security.authorization", "test");
+ hdfsMap.put("dfs.namenode.kerberos.principal.pattern", "test");
+ hdfsMap.put("KerberosKeytabUser", "test");
+ hdfsMap.put("KerberosKeytabPath", "test");
+ hdfsMap.put("dfs.client.use.datanode.hostname", "false");
+ hdfsMap.put("hadoop.kerberos.keytab.login.autorenewal.enabled", "true");
+ hdfsMap.put("dfs.data.transfer.protection", "test");
+ hdfsMap.put("dfs.encrypt.data.transfer.cipher.suites", "test");
+ hdfsMap.put("maximumFileSize", "3000");
+ hdfsConfig = new HdfsConfiguration(hdfsMap);
+ fs = new TestFileSystemFactory().create(hdfsConfig.hdfsUri());
// Inserts pre-made avro-files with old timestamps to HDFS, which are normally generated during data ingestion from mock kafka consumer.
- String path = config.getHdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
+ String path = hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"; // "hdfs:///opt/teragrep/cfe_39/srv/testConsumerTopic"
// Sets the directory where the data should be stored, if the directory doesn't exist then it's created.
Path newDirectoryPath = new Path(path);
// Create new Directory
@@ -131,24 +147,26 @@ public void teardownMiniCluster() {
@Test
public void twoOldFilesTest() {
// This test case is for testing the functionality of the HDFSPrune.java when the database holds two files with a timestamp that should trigger pruning of old files.
- Assertions.assertTrue(config.getPruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
- Assertions.assertTrue(System.currentTimeMillis() - config.getPruneOffset() > 157784760000L);
+ Assertions.assertTrue(hdfsConfig.pruneOffset() >= 300000L); // Fails the test if the config is not correct, too low pruning offset can prune the files if the test is lagging.
+ Assertions.assertTrue(System.currentTimeMillis() - hdfsConfig.pruneOffset() > 157784760000L); // NOTE(review): presumably checks that the cutoff (now - offset) is later than the old fixture files' timestamps; confirm against the fixture setup.
assertDoesNotThrow(() -> {
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")));
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic"))); // Precondition: the topic directory exists before pruning.
Assertions
- .assertEquals(2, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
- Assertions.assertTrue(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
- HDFSPrune hdfsPrune = new HDFSPrune(config, "testConsumerTopic", fs);
+ .assertEquals(2, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length);
+ Assertions.assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ Assertions
+ .assertTrue(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ HDFSPrune hdfsPrune = new HDFSPrune(hdfsConfig, "testConsumerTopic", fs);
int deleted = hdfsPrune.prune();
Assertions.assertEquals(2, deleted);
// Also check with HDFS access if expected files still exist.
Assertions
- .assertEquals(0, fs.listStatus(new Path(config.getHdfsPath() + "/" + "testConsumerTopic")).length);
- Assertions.assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
+ .assertEquals(0, fs.listStatus(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic")).length); // Both files were pruned, so the topic directory is empty.
+ Assertions
+ .assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.9")));
Assertions
- .assertFalse(fs.exists(new Path(config.getHdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
+ .assertFalse(fs.exists(new Path(hdfsConfig.hdfsPath() + "/" + "testConsumerTopic" + "/" + "0.13")));
});
}
}
diff --git a/src/test/java/com/teragrep/cfe_39/SyslogAvroWriterTest.java b/src/test/java/com/teragrep/cfe_39/SyslogAvroWriterTest.java
new file mode 100644
index 00000000..43e2a621
--- /dev/null
+++ b/src/test/java/com/teragrep/cfe_39/SyslogAvroWriterTest.java
@@ -0,0 +1,188 @@
+/*
+ * HDFS Data Ingestion for PTH_06 use CFE-39
+ * Copyright (C) 2021-2024 Suomen Kanuuna Oy
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ *
+ *
+ * Additional permission under GNU Affero General Public License version 3
+ * section 7
+ *
+ * If you modify this Program, or any covered work, by linking or combining it
+ * with other code, such other code is not for that reason alone subject to any
+ * of the requirements of the GNU Affero GPL version 3 as long as this Program
+ * is the same Program as licensed from Suomen Kanuuna Oy without any additional
+ * modifications.
+ *
+ * Supplemented terms under GNU Affero General Public License version 3
+ * section 7
+ *
+ * Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
+ * versions must be marked as "Modified version of" The Program.
+ *
+ * Names of the licensors and authors may not be used for publicity purposes.
+ *
+ * No rights are granted for use of trade names, trademarks, or service marks
+ * which are in The Program if any.
+ *
+ * Licensee must indemnify licensors and authors for any liability that these
+ * contractual assumptions impose on licensors and authors.
+ *
+ * To the extent this program is licensed as part of the Commercial versions of
+ * Teragrep, the applicable Commercial License may apply to this file if you as
+ * a licensee so wish it.
+ */
+package com.teragrep.cfe_39;
+
+import com.teragrep.cfe_39.avro.SyslogRecord;
+import com.teragrep.cfe_39.configuration.CommonConfiguration;
+import com.teragrep.cfe_39.consumers.kafka.KafkaRecordImpl;
+import com.teragrep.cfe_39.consumers.kafka.SyslogAvroWriter;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+public class SyslogAvroWriterTest {
+
+ private static CommonConfiguration config;
+
+ // Prepares known state for testing: ensures <user.dir>/target/AVRO exists and builds the CommonConfiguration for the test. No HDFS mini cluster is started here, despite the method name.
+ @BeforeEach
+ public void startMiniCluster() {
+ assertDoesNotThrow(() -> {
+ File queueDir = new File(System.getProperty("user.dir") + "/target/AVRO");
+ if (!queueDir.exists()) {
+ queueDir.mkdirs(); // Create the local queue directory (and missing parents) when it is absent.
+ }
+ Map map = new HashMap<>();
+ map.put("log4j2.configurationFile", "/opt/teragrep/cfe_39/etc/log4j2.properties");
+ map.put("egress.configurationFile", "/opt/teragrep/cfe_39/etc/egress.properties");
+ map.put("ingress.configurationFile", "/opt/teragrep/cfe_39/etc/ingress.properties");
+ map.put("queueDirectory", System.getProperty("user.dir") + "/target/AVRO/"); // Same directory as queueDir above.
+ map.put("queueTopicPattern", "^testConsumerTopic-*$");
+ map.put("skipNonRFC5424Records", "true");
+ map.put("skipEmptyRFC5424Records", "true");
+ map.put("pruneOffset", "157784760000");
+ map.put("consumerTimeout", "600000");
+ config = new CommonConfiguration(map); // Stored in the static field read by writeTest and the teardown.
+ });
+ }
+
+ // Removes the AVRO file left in the queue directory by writeTest; does nothing when the directory is missing or empty.
+ @AfterEach
+ public void teardownMiniCluster() {
+ File queueDirectory = new File(config.queueDirectory());
+ File[] files = queueDirectory.listFiles(); // null when the path is not a directory or cannot be read
+ if (files != null && files.length > 0 && files[0].getName().equals("topicName0.1")) { // guard before indexing
+ files[0].delete();
+ }
+ }
+
+ @Test
+ public void writeTest() {
+
+ assertDoesNotThrow(() -> {
+
+ File queueDirectory = new File(config.queueDirectory());
+
+ File syslogFile = new File(config.queueDirectory() + File.separator + "topicName0.1");
+
+ ConsumerRecord record0 = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 0L,
+ "2022-04-25T07:34:50.804Z".getBytes(StandardCharsets.UTF_8),
+ "<12>1 2022-04-25T07:34:50.804Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"835bf792-91cf-44e3-976b-518330bb8fd3\" source=\"source\" unixtime=\"1650872090805\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [WARN] 2022-04-25 07:34:50,804 com.teragrep.jla_02.Log4j Log - Log4j warn says hi!"
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ KafkaRecordImpl recordOffsetObject0 = new KafkaRecordImpl(
+ record0.topic(),
+ record0.partition(),
+ record0.offset(),
+ record0.value()
+ );
+
+ ConsumerRecord record1 = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 1L,
+ "2022-04-25T07:34:50.806Z".getBytes(StandardCharsets.UTF_8),
+ "<12>1 2022-04-25T07:34:50.806Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02.default\"][event_id@48577 hostname=\"jla-02.default\" uuid=\"c3f13f9a-05e2-41bd-b0ad-1eca6fd6fd9a\" source=\"source\" unixtime=\"1650872090806\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] [ERROR] 2022-04-25 07:34:50,806 com.teragrep.jla_02.Log4j Log - Log4j error says hi!"
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ KafkaRecordImpl recordOffsetObject1 = new KafkaRecordImpl(
+ record1.topic(),
+ record1.partition(),
+ record1.offset(),
+ record1.value()
+ );
+
+ ConsumerRecord record2 = new ConsumerRecord<>(
+ "topicName",
+ 0,
+ 2L,
+ "2022-04-25T07:34:50.822Z".getBytes(StandardCharsets.UTF_8),
+ "<12>1 2022-04-25T07:34:50.822Z jla-02.default jla02logger - - [origin@48577 hostname=\"jla-02\"][event_id@48577 hostname=\"jla-02\" uuid=\"1848d8a1-2f08-4a1e-bec4-ff9e6dd92553\" source=\"source\" unixtime=\"1650872090822\"][event_format@48577 original_format=\"rfc5424\"][event_node_relay@48577 hostname=\"cfe-06-0.cfe-06.default\" source=\"kafka-4.kafka.default.svc.cluster.local\" source_module=\"imrelp\"][event_version@48577 major=\"2\" minor=\"2\" hostname=\"cfe-06-0.cfe-06.default\" version_source=\"relay\"][event_node_router@48577 source=\"cfe-06-0.cfe-06.default.svc.cluster.local\" source_module=\"imrelp\" hostname=\"cfe-07-0.cfe-07.default\"][teragrep@48577 streamname=\"test:jla02logger:0\" directory=\"jla02logger\" unixtime=\"1650872090\"] 470647 [Thread-3] INFO com.teragrep.jla_02.Logback Daily - Logback-daily says hi."
+ .getBytes(StandardCharsets.UTF_8)
+ );
+ KafkaRecordImpl recordOffsetObject2 = new KafkaRecordImpl(
+ record2.topic(),
+ record2.partition(),
+ record2.offset(),
+ record2.value()
+ );
+
+ try (SyslogAvroWriter syslogAvroWriter = new SyslogAvroWriter(syslogFile)) {
+ syslogAvroWriter.write(recordOffsetObject0.toSyslogRecord());
+ syslogAvroWriter.write(recordOffsetObject1.toSyslogRecord());
+ syslogAvroWriter.write(recordOffsetObject2.toSyslogRecord());
+ }
+ try (SyslogAvroWriter syslogAvroWriter = new SyslogAvroWriter(syslogFile)) {
+ syslogAvroWriter.write(recordOffsetObject2.toSyslogRecord());
+ }
+ DatumReader datumReader = new SpecificDatumReader<>(SyslogRecord.class);
+ DataFileReader