From d15cb16dfb6a54979eeed4ba9fd80f3fc52fd1ab Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sat, 11 Jul 2015 00:08:51 +0100 Subject: [PATCH 01/11] Switch to using gradle for build --- build.gradle | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 build.gradle diff --git a/build.gradle b/build.gradle new file mode 100644 index 0000000..5879927 --- /dev/null +++ b/build.gradle @@ -0,0 +1,58 @@ +apply plugin: 'java' +apply plugin: 'eclipse' +apply plugin: 'application' + +mainClassName = 'com.filbertkm.importer.Importer' + +sourceCompatibility = 1.7 +targetCompatibility = 1.7 + +def env = System.getenv() + +def cmdBranch = "git rev-parse --abbrev-ref HEAD" +def procBranch = cmdBranch.execute() +def gitBranch = procBranch.text.trim() + +def cmdVersion = "git rev-parse HEAD" +def procVersion = cmdVersion.execute() +def gitRevision = procVersion.text.trim() + +// basic setup for the script to run +buildscript { + + dependencies { + repositories { + mavenCentral() + } + } +} + +jar { + manifest { + attributes("Main-Class": "${mainClassName}", + "Git-Branch": ((null != gitBranch) ? gitBranch : ""), + "Git-Revision": ((null != gitRevision) ? gitRevision : "")) + } + from { configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } } +} + +// instruct Gradle to look in mavenCentral repository first, then the /lib directory +repositories { + mavenCentral() + flatDir { dirs 'lib' } +} + +dependencies { + compile "org.wikidata.wdtk:wdtk-datamodel:0.4.0" + compile "org.wikidata.wdtk:wdtk-dumpfiles:0.4.0" + compile "org.slf4j:slf4j-log4j12:1.7.6" + compile "args4j:args4j:2.32" + compile "org.postgresql:postgresql:9.3-1103-jdbc41" + compile "com.fasterxml.jackson.core:jackson-core:2.5.3" + compile "com.fasterxml.jackson.core:jackson-annotations:2.5.3" + compile "com.fasterxml.jackson.core:jackson-databind:2.5.3" +} + +test { + jvmArgs "-XX:-UseSplitVerifier" +} From c5e16e699f0c8a0d7bd68df392e443b5b06c7226 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sat, 11 Jul 2015 00:12:37 +0100 Subject: [PATCH 02/11] Move to more standard directory structure --- src/{ => main/java}/com/filbertkm/importer/Configuration.java | 0 src/{ => main/java}/com/filbertkm/importer/Importer.java | 0 src/{ => main/java}/com/filbertkm/importer/JsonDumpProcessor.java | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/{ => main/java}/com/filbertkm/importer/Configuration.java (100%) rename src/{ => main/java}/com/filbertkm/importer/Importer.java (100%) rename src/{ => main/java}/com/filbertkm/importer/JsonDumpProcessor.java (100%) diff --git a/src/com/filbertkm/importer/Configuration.java b/src/main/java/com/filbertkm/importer/Configuration.java similarity index 100% rename from src/com/filbertkm/importer/Configuration.java rename to src/main/java/com/filbertkm/importer/Configuration.java diff --git a/src/com/filbertkm/importer/Importer.java b/src/main/java/com/filbertkm/importer/Importer.java similarity index 100% rename from src/com/filbertkm/importer/Importer.java rename to src/main/java/com/filbertkm/importer/Importer.java diff --git a/src/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java similarity index 100% rename from 
src/com/filbertkm/importer/JsonDumpProcessor.java rename to src/main/java/com/filbertkm/importer/JsonDumpProcessor.java From 5fe526b6a0c8a98eaa9920099de75cd677a3b1b8 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sat, 11 Jul 2015 00:13:28 +0100 Subject: [PATCH 03/11] Fix bug in number of columns in DB query --- src/main/java/com/filbertkm/importer/JsonDumpProcessor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index a5bd5ea..bd9c444 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -90,8 +90,8 @@ private void extractDescriptions(ItemDocument itemDocument) { try { PreparedStatement pst = this.conn.prepareStatement(query); pst.setString(1, itemDocument.getEntityId().getId()); - pst.setString(3, description.getValue().getLanguageCode()); - pst.setString(4, description.getValue().getText()); + pst.setString(2, description.getValue().getLanguageCode()); + pst.setString(3, description.getValue().getText()); pst.executeUpdate(); } catch (SQLException e) { From 079570efc3c879322aafe3d90f09fa03dd841891 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sat, 11 Jul 2015 00:49:15 +0100 Subject: [PATCH 04/11] Allow for multiple of the same property value for a single entity --- sql/schema.sql | 6 +++--- .../filbertkm/importer/JsonDumpProcessor.java | 21 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/sql/schema.sql b/sql/schema.sql index bd5ba96..df27bf6 100644 --- a/sql/schema.sql +++ b/sql/schema.sql @@ -1,5 +1,4 @@ CREATE EXTENSION IF NOT EXISTS postgis; -CREATE EXTENSION IF NOT EXISTS hstore; CREATE TABLE coordinates( id serial PRIMARY KEY, @@ -12,10 +11,11 @@ CREATE TABLE coordinates( SELECT AddGeometryColumn ('public', 'coordinates', 'geom', 4326, 'POINT', 2); -CREATE TABLE value_snaks( 
+CREATE TABLE value( id serial PRIMARY KEY, entity_id VARCHAR, - values HSTORE + property_id VARCHAR, + value TEXT ); CREATE TABLE terms( diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index bd9c444..905ced2 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -27,7 +27,7 @@ public class JsonDumpProcessor implements EntityDocumentProcessor { private static final Logger logger = Logger.getLogger(Importer.class); - + private static final String HSTORE_SEPARATOR_TOKEN = "=>"; private Connection conn; @@ -121,7 +121,7 @@ private void extractSnaks(ItemDocument itemDocument) { ArrayList snaks = new ArrayList<>(); for (StatementGroup statementGroup : itemDocument.getStatementGroups()) { - String propertyId = statementGroup.getProperty().getId(); + String propertyId = statementGroup.getProperty().getId(); for (Statement statement : statementGroup.getStatements()) { if (statement.getClaim().getMainSnak() instanceof ValueSnak) { @@ -132,16 +132,18 @@ private void extractSnaks(ItemDocument itemDocument) { this.insertCoordinates(itemDocument, coordinates); } else if (value instanceof EntityIdValue) { EntityIdValue entityIdValue = (EntityIdValue)value; - snaks.add(this.buildEntityIdValueSnak(propertyId, entityIdValue)); + insertValueSnaks(itemDocument, propertyId, entityIdValue.getId()); +// snaks.add(this.buildEntityIdValueSnak(propertyId, entityIdValue)); } else if (value instanceof StringValue) { StringValue stringValue = (StringValue)value; - snaks.add(this.buildValueSnak(propertyId, stringValue.getString())); + insertValueSnaks(itemDocument, propertyId, stringValue.getString()); +// snaks.add(this.buildValueSnak(propertyId, stringValue.getString())); } } } } - insertValueSnaks(itemDocument, snaks); + //insertValueSnaks(itemDocument, snaks); } private void insertCoordinates(ItemDocument itemDocument, 
GlobeCoordinatesValue value) { @@ -180,13 +182,14 @@ private String buildValueSnak(String propertyId, String value) { return builder.toString(); } - private void insertValueSnaks(ItemDocument itemDocument, ArrayList snaks) { - String query = "INSERT INTO value_snaks (entity_id, values) VALUES(?,?)"; + private void insertValueSnaks(ItemDocument itemDocument, String propertyId, String value) { + String query = "INSERT INTO value (entity_id, property_id, value) VALUES(?,?,?)"; try { PreparedStatement pst = this.conn.prepareStatement(query); - pst.setString(1, itemDocument.getEntityId().getId()); - pst.setObject(2, buildSnaksString(snaks), Types.OTHER); + pst.setString(1, itemDocument.getEntityId().getId()); + pst.setString(2, propertyId); + pst.setString(3, value); System.out.println(pst.toString()); pst.executeUpdate(); } catch (SQLException e) { From c27a942577b11d2ec037e9e2d265c66f12bb3502 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sat, 11 Jul 2015 01:01:49 +0100 Subject: [PATCH 05/11] Use PreparedStatements correctly for performance --- .../filbertkm/importer/JsonDumpProcessor.java | 83 +++++++++++-------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index 905ced2..42e0a5d 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -31,9 +31,36 @@ public class JsonDumpProcessor implements EntityDocumentProcessor { private static final String HSTORE_SEPARATOR_TOKEN = "=>"; private Connection conn; + + private PreparedStatement pstInsertDescription; + private PreparedStatement pstInsertTerms; + private PreparedStatement pstInsertCoordinates; + private PreparedStatement pstInsertValue; public JsonDumpProcessor(Connection conn) { this.conn = conn; + + try { + String queryInsertDescriptions = "INSERT INTO descriptions (entity_id, 
term_language, term_text)" + + " VALUES(?, ?, ?)"; + pstInsertDescription = this.conn.prepareStatement(queryInsertDescriptions); + + String queryInsertTerms = "INSERT INTO terms (entity_id, term_type, term_language, term_text)" + + " VALUES(?, ?, ?, ?)"; + pstInsertTerms = this.conn.prepareStatement(queryInsertTerms); + + String queryInsertCoordinates = "INSERT INTO coordinates (entity_id, globe, precision, latitude, longitude)" + + " VALUES(?, ?, ?, ?, ?)"; + pstInsertCoordinates = this.conn.prepareStatement(queryInsertCoordinates); + + String queryInsertValue = "INSERT INTO value (entity_id, property_id, value) VALUES(?,?,?)"; + pstInsertValue = this.conn.prepareStatement(queryInsertValue); + + } catch (SQLException e) { + e.printStackTrace(); + } + + } public void processItemDocument(ItemDocument itemDocument) { @@ -83,17 +110,12 @@ private void extractAliases(ItemDocument itemDocument) { private void extractDescriptions(ItemDocument itemDocument) { Map descriptions = itemDocument.getDescriptions(); - String query = "INSERT INTO descriptions (entity_id, term_language, term_text)" - + " VALUES(?, ?, ?)"; - for (Map.Entry description : descriptions.entrySet()) { try { - PreparedStatement pst = this.conn.prepareStatement(query); - pst.setString(1, itemDocument.getEntityId().getId()); - pst.setString(2, description.getValue().getLanguageCode()); - pst.setString(3, description.getValue().getText()); - - pst.executeUpdate(); + pstInsertDescription.setString(1, itemDocument.getEntityId().getId()); + pstInsertDescription.setString(2, description.getValue().getLanguageCode()); + pstInsertDescription.setString(3, description.getValue().getText()); + pstInsertDescription.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); } @@ -101,17 +123,13 @@ private void extractDescriptions(ItemDocument itemDocument) { } private void addTermToDatabase(String itemId, String termType, String languageCode, String text) { - String query = "INSERT INTO terms (entity_id, 
term_type, term_language, term_text)" - + " VALUES(?, ?, ?, ?)"; try { - PreparedStatement pst = this.conn.prepareStatement(query); - pst.setString(1, itemId); - pst.setString(2, termType); - pst.setString(3, languageCode); - pst.setString(4, text); - - pst.executeUpdate(); + pstInsertTerms.setString(1, itemId); + pstInsertTerms.setString(2, termType); + pstInsertTerms.setString(3, languageCode); + pstInsertTerms.setString(4, text); + pstInsertTerms.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); } @@ -147,18 +165,14 @@ private void extractSnaks(ItemDocument itemDocument) { } private void insertCoordinates(ItemDocument itemDocument, GlobeCoordinatesValue value) { - String query = "INSERT INTO coordinates (entity_id, globe, precision, latitude, longitude)" - + " VALUES(?, ?, ?, ?, ?)"; - + try { - PreparedStatement pst = this.conn.prepareStatement(query); - pst.setString(1, itemDocument.getEntityId().getId()); - pst.setString(2, value.getGlobe()); - pst.setDouble(3, value.getPrecision()); - pst.setDouble(4, value.getLatitude()); - pst.setDouble(5, value.getLongitude()); - - pst.executeUpdate(); + pstInsertCoordinates.setString(1, itemDocument.getEntityId().getId()); + pstInsertCoordinates.setString(2, value.getGlobe()); + pstInsertCoordinates.setDouble(3, value.getPrecision()); + pstInsertCoordinates.setDouble(4, value.getLatitude()); + pstInsertCoordinates.setDouble(5, value.getLongitude()); + pstInsertCoordinates.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); } @@ -183,15 +197,12 @@ private String buildValueSnak(String propertyId, String value) { } private void insertValueSnaks(ItemDocument itemDocument, String propertyId, String value) { - String query = "INSERT INTO value (entity_id, property_id, value) VALUES(?,?,?)"; try { - PreparedStatement pst = this.conn.prepareStatement(query); - pst.setString(1, itemDocument.getEntityId().getId()); - pst.setString(2, propertyId); - pst.setString(3, value); - 
System.out.println(pst.toString()); - pst.executeUpdate(); + pstInsertValue.setString(1, itemDocument.getEntityId().getId()); + pstInsertValue.setString(2, propertyId); + pstInsertValue.setString(3, value); + pstInsertValue.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); } From e0baf6ee003a5e33e220b9d22f821b0f33f6a1e9 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Sun, 12 Jul 2015 19:46:45 +0100 Subject: [PATCH 06/11] add dbHost param --- src/main/java/com/filbertkm/importer/Configuration.java | 7 +++++++ src/main/java/com/filbertkm/importer/Importer.java | 9 ++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/Configuration.java b/src/main/java/com/filbertkm/importer/Configuration.java index a0e8190..e445d3b 100644 --- a/src/main/java/com/filbertkm/importer/Configuration.java +++ b/src/main/java/com/filbertkm/importer/Configuration.java @@ -4,6 +4,9 @@ public class Configuration { + @Option(name = "-dbhost", usage = "database host", required = true) + private String dbhost; + @Option(name = "-dbuser", usage = "database user", required = true) private String dbuser; @@ -16,6 +19,10 @@ public class Configuration { @Option(name = "-dumpdir", usage = "dump directory", required = true) private String dumpdir; + public String getDBHost() { + return dbhost; + } + public String getDbUser() { return dbuser; } diff --git a/src/main/java/com/filbertkm/importer/Importer.java b/src/main/java/com/filbertkm/importer/Importer.java index b9eeb1b..703f3ec 100644 --- a/src/main/java/com/filbertkm/importer/Importer.java +++ b/src/main/java/com/filbertkm/importer/Importer.java @@ -13,6 +13,8 @@ public class Importer { private Connection conn; + private String dbHost; + private String dbUser; private String dbName; @@ -26,7 +28,7 @@ public static void main(String[] args) { try { parser.parseArgument(args); - Importer importer = new Importer(config.getDbUser(), config.getDbName(), config.getDbPass()); + 
Importer importer = new Importer(config.getDBHost(), config.getDbUser(), config.getDbName(), config.getDbPass()); importer.process("wikidatawiki", config.getDumpDir()); } catch (CmdLineException e) { // omg! @@ -37,7 +39,8 @@ public static void main(String[] args) { System.out.println("done"); } - public Importer(String dbUser, String dbName, String dbPass) { + public Importer(String dbHost, String dbUser, String dbName, String dbPass) { + this.dbHost = dbHost; this.dbUser = dbUser; this.dbName = dbName; this.dbPass = dbPass; @@ -64,7 +67,7 @@ private Connection getConnection() { if (this.conn == null) { try { this.conn = DriverManager.getConnection( - "jdbc:postgresql://127.0.0.1:5432/" + this.dbName, + "jdbc:postgresql://" + this.dbHost + ":5432/" + this.dbName, this.dbUser, this.dbPass ); From dac3abdb514348c340f30941c0b3d12b88a002ab Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Mon, 13 Jul 2015 15:20:25 +0100 Subject: [PATCH 07/11] Switch to using batch mode to push data to postgresql for performance --- .../java/com/filbertkm/importer/Importer.java | 1 + .../filbertkm/importer/JsonDumpProcessor.java | 27 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/Importer.java b/src/main/java/com/filbertkm/importer/Importer.java index 703f3ec..d3a2156 100644 --- a/src/main/java/com/filbertkm/importer/Importer.java +++ b/src/main/java/com/filbertkm/importer/Importer.java @@ -57,6 +57,7 @@ public void process(String wikiId, String dumpDirectory) { dumpProcessingController.setDownloadDirectory(dumpDirectory); dumpProcessingController.registerEntityDocumentProcessor(jsonDumpProcessor, null, true); dumpProcessingController.processMostRecentJsonDump(); + jsonDumpProcessor.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index 
42e0a5d..99aa12a 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -36,6 +36,9 @@ public class JsonDumpProcessor implements EntityDocumentProcessor { private PreparedStatement pstInsertTerms; private PreparedStatement pstInsertCoordinates; private PreparedStatement pstInsertValue; + + private int documentCount = 0; + private int documentBatchSize = 1000; public JsonDumpProcessor(Connection conn) { this.conn = conn; @@ -69,6 +72,22 @@ public void processItemDocument(ItemDocument itemDocument) { extractTerms(itemDocument); extractSnaks(itemDocument); + + documentCount++; + if (documentCount > documentBatchSize) { + flush(); + } + } + + public void flush() { + try { + pstInsertDescription.executeBatch(); + pstInsertTerms.executeBatch(); + pstInsertCoordinates.executeBatch(); + pstInsertValue.executeBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } } private void extractTerms(ItemDocument itemDocument) { @@ -115,7 +134,7 @@ private void extractDescriptions(ItemDocument itemDocument) { pstInsertDescription.setString(1, itemDocument.getEntityId().getId()); pstInsertDescription.setString(2, description.getValue().getLanguageCode()); pstInsertDescription.setString(3, description.getValue().getText()); - pstInsertDescription.executeUpdate(); + pstInsertDescription.addBatch(); } catch (SQLException e) { e.printStackTrace(); } @@ -129,7 +148,7 @@ private void addTermToDatabase(String itemId, String termType, String languageCo pstInsertTerms.setString(2, termType); pstInsertTerms.setString(3, languageCode); pstInsertTerms.setString(4, text); - pstInsertTerms.executeUpdate(); + pstInsertTerms.addBatch(); } catch (SQLException e) { e.printStackTrace(); } @@ -172,7 +191,7 @@ private void insertCoordinates(ItemDocument itemDocument, GlobeCoordinatesValue pstInsertCoordinates.setDouble(3, value.getPrecision()); pstInsertCoordinates.setDouble(4, value.getLatitude()); 
pstInsertCoordinates.setDouble(5, value.getLongitude()); - pstInsertCoordinates.executeUpdate(); + pstInsertCoordinates.addBatch(); } catch (SQLException e) { e.printStackTrace(); } @@ -202,7 +221,7 @@ private void insertValueSnaks(ItemDocument itemDocument, String propertyId, Stri pstInsertValue.setString(1, itemDocument.getEntityId().getId()); pstInsertValue.setString(2, propertyId); pstInsertValue.setString(3, value); - pstInsertValue.executeUpdate(); + pstInsertValue.addBatch(); } catch (SQLException e) { e.printStackTrace(); } From 21fa5cffbf8bbab5ac59d99c5dac55468f8f3784 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Tue, 14 Jul 2015 06:35:37 +0100 Subject: [PATCH 08/11] Split out tables into a table per type. Add DateTime claims. Add property value to all claims. Add sitelinks. --- sql/schema.sql | 67 +++-- .../filbertkm/importer/JsonDumpProcessor.java | 232 ++++++++++-------- 2 files changed, 180 insertions(+), 119 deletions(-) diff --git a/sql/schema.sql b/sql/schema.sql index df27bf6..4bf84a4 100644 --- a/sql/schema.sql +++ b/sql/schema.sql @@ -1,34 +1,71 @@ +CREATE TABLE label( + id serial PRIMARY KEY, + entity_id TEXT NOT NULL, + label_language TEXT, + label_text TEXT +); + +CREATE TABLE alias( + id serial PRIMARY KEY, + entity_id TEXT NOT NULL, + alias_language TEXT, + alias_text TEXT +); + +CREATE TABLE description( + id serial PRIMARY KEY, + entity_id TEXT NOT NULL, + description_language TEXT, + description_text TEXT +); + +CREATE TABLE sitelink( + id serial PRIMARY KEY, + entity_id TEXT NOT NULL, + site_key TEXT, + page_title TEXT +); + CREATE EXTENSION IF NOT EXISTS postgis; -CREATE TABLE coordinates( +CREATE TABLE claim_coordinate( id serial PRIMARY KEY, entity_id TEXT NOT NULL, + property_id TEXT NOT NULL, globe TEXT default NULL, precision double precision, latitude double precision, longitude double precision ); -SELECT AddGeometryColumn ('public', 'coordinates', 'geom', 4326, 'POINT', 2); +SELECT AddGeometryColumn ('public', 
'claim_coordinate', 'geom', 4326, 'POINT', 2); -CREATE TABLE value( +CREATE TABLE claim_datetime( id serial PRIMARY KEY, - entity_id VARCHAR, - property_id VARCHAR, - value TEXT + entity_id TEXT NOT NULL, + property_id TEXT NOT NULL, + calendar TEXT default NULL, + year text, + month text, + day text, + hour text, + minute text, + second text, + precision text, + tolerance_before text, + tolerance_after text ); -CREATE TABLE terms( +CREATE TABLE claim_entity( id serial PRIMARY KEY, - entity_id TEXT NOT NULL, - term_type TEXT, - term_language TEXT, - term_text TEXT + entity_id VARCHAR, + property_id VARCHAR, + value TEXT ); -CREATE TABLE descriptions( +CREATE TABLE claim_string( id serial PRIMARY KEY, - entity_id TEXT NOT NULL, - term_language TEXT, - term_text TEXT + entity_id VARCHAR, + property_id VARCHAR, + value TEXT ); diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index 99aa12a..7da70ae 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -3,7 +3,6 @@ import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; -import java.sql.Types; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -18,24 +17,28 @@ import org.wikidata.wdtk.datamodel.interfaces.ItemDocument; import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue; import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument; +import org.wikidata.wdtk.datamodel.interfaces.SiteLink; import org.wikidata.wdtk.datamodel.interfaces.Statement; import org.wikidata.wdtk.datamodel.interfaces.StatementGroup; import org.wikidata.wdtk.datamodel.interfaces.StringValue; +import org.wikidata.wdtk.datamodel.interfaces.TimeValue; import org.wikidata.wdtk.datamodel.interfaces.Value; import org.wikidata.wdtk.datamodel.interfaces.ValueSnak; public class JsonDumpProcessor implements 
EntityDocumentProcessor { private static final Logger logger = Logger.getLogger(Importer.class); - - private static final String HSTORE_SEPARATOR_TOKEN = "=>"; private Connection conn; + private PreparedStatement pstInsertLabel; + private PreparedStatement pstInsertAlias; private PreparedStatement pstInsertDescription; - private PreparedStatement pstInsertTerms; - private PreparedStatement pstInsertCoordinates; - private PreparedStatement pstInsertValue; + private PreparedStatement pstInsertSiteLink; + private PreparedStatement pstInsertClauseCoordinates; + private PreparedStatement pstInsertClauseDateTime; + private PreparedStatement pstInsertClauseEntity; + private PreparedStatement pstInsertClauseString; private int documentCount = 0; private int documentBatchSize = 1000; @@ -44,34 +47,50 @@ public JsonDumpProcessor(Connection conn) { this.conn = conn; try { - String queryInsertDescriptions = "INSERT INTO descriptions (entity_id, term_language, term_text)" + String queryInsertLabel = "INSERT INTO label (entity_id, label_language, label_text)" + " VALUES(?, ?, ?)"; - pstInsertDescription = this.conn.prepareStatement(queryInsertDescriptions); + pstInsertLabel = this.conn.prepareStatement(queryInsertLabel); + + String queryInsertAlias = "INSERT INTO alias (entity_id, alias_language, alias_text)" + + " VALUES(?, ?, ?)"; + pstInsertAlias = this.conn.prepareStatement(queryInsertAlias); - String queryInsertTerms = "INSERT INTO terms (entity_id, term_type, term_language, term_text)" - + " VALUES(?, ?, ?, ?)"; - pstInsertTerms = this.conn.prepareStatement(queryInsertTerms); + String queryInsertDescriptions = "INSERT INTO description (entity_id, description_language, description_text)" + + " VALUES(?, ?, ?)"; + pstInsertDescription = this.conn.prepareStatement(queryInsertDescriptions); + + String queryInsertSiteLink = "INSERT INTO sitelink (entity_id, site_key, page_title)" + + " VALUES(?, ?, ?)"; + pstInsertSiteLink = this.conn.prepareStatement(queryInsertSiteLink); - 
String queryInsertCoordinates = "INSERT INTO coordinates (entity_id, globe, precision, latitude, longitude)" - + " VALUES(?, ?, ?, ?, ?)"; - pstInsertCoordinates = this.conn.prepareStatement(queryInsertCoordinates); + String queryInsertClauseCoordinates = "INSERT INTO claim_coordinate (entity_id, property_id, globe, precision, latitude, longitude)" + + " VALUES(?, ?, ?, ?, ?, ?)"; + pstInsertClauseCoordinates = this.conn.prepareStatement(queryInsertClauseCoordinates); + + String queryInsertClauseDateTime = "INSERT INTO claim_datetime (entity_id, property_id, calendar, year, month, day, hour, minute, second, precision, tolerance_before, tolerance_after)" + + " VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; + pstInsertClauseDateTime = this.conn.prepareStatement(queryInsertClauseDateTime); - String queryInsertValue = "INSERT INTO value (entity_id, property_id, value) VALUES(?,?,?)"; - pstInsertValue = this.conn.prepareStatement(queryInsertValue); + String queryInsertClauseEntity = "INSERT INTO claim_entity (entity_id, property_id, value) VALUES (?,?,?)"; + pstInsertClauseEntity = this.conn.prepareStatement(queryInsertClauseEntity); + + String queryInsertClauseString = "INSERT INTO claim_string (entity_id, property_id, value) VALUES (?,?,?)"; + pstInsertClauseString = this.conn.prepareStatement(queryInsertClauseString); } catch (SQLException e) { e.printStackTrace(); } - - } public void processItemDocument(ItemDocument itemDocument) { String itemId = itemDocument.getEntityId().getId(); logger.info("Processing: " + itemId); - - extractTerms(itemDocument); - extractSnaks(itemDocument); + + extractLabels(itemDocument); + extractAliases(itemDocument); + extractDescriptions(itemDocument); + extractSiteLinks(itemDocument); + extractClaims(itemDocument); documentCount++; if (documentCount > documentBatchSize) { @@ -81,31 +100,31 @@ public void processItemDocument(ItemDocument itemDocument) { public void flush() { try { + pstInsertLabel.executeBatch(); + 
pstInsertAlias.executeBatch(); pstInsertDescription.executeBatch(); - pstInsertTerms.executeBatch(); - pstInsertCoordinates.executeBatch(); - pstInsertValue.executeBatch(); + pstInsertSiteLink.executeBatch(); + pstInsertClauseCoordinates.executeBatch(); + pstInsertClauseDateTime.executeBatch(); + pstInsertClauseEntity.executeBatch(); + pstInsertClauseString.executeBatch(); } catch (SQLException e) { e.printStackTrace(); } } - - private void extractTerms(ItemDocument itemDocument) { - extractLabels(itemDocument); - extractAliases(itemDocument); - extractDescriptions(itemDocument); - } - + private void extractLabels(ItemDocument itemDocument) { Map labels = itemDocument.getLabels(); for (Map.Entry label : labels.entrySet()) { - addTermToDatabase( - itemDocument.getEntityId().getId(), - "label", - label.getValue().getLanguageCode(), - label.getValue().getText() - ); + try { + pstInsertLabel.setString(1, itemDocument.getEntityId().getId()); + pstInsertLabel.setString(2, label.getValue().getLanguageCode()); + pstInsertLabel.setString(3, label.getValue().getText()); + pstInsertLabel.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } } } @@ -116,12 +135,15 @@ private void extractAliases(ItemDocument itemDocument) { List languageAliases = aliasMap.getValue(); for (MonolingualTextValue alias : languageAliases) { - addTermToDatabase( - itemDocument.getEntityId().getId(), - "alias", - alias.getLanguageCode(), - alias.getText() - ); + + try { + pstInsertAlias.setString(1, itemDocument.getEntityId().getId()); + pstInsertAlias.setString(2, alias.getLanguageCode()); + pstInsertAlias.setString(3, alias.getText()); + pstInsertAlias.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } } } } @@ -140,25 +162,27 @@ private void extractDescriptions(ItemDocument itemDocument) { } } } - - private void addTermToDatabase(String itemId, String termType, String languageCode, String text) { + + private void extractSiteLinks(ItemDocument itemDocument) { + Map 
sitelinks = itemDocument.getSiteLinks(); - try { - pstInsertTerms.setString(1, itemId); - pstInsertTerms.setString(2, termType); - pstInsertTerms.setString(3, languageCode); - pstInsertTerms.setString(4, text); - pstInsertTerms.addBatch(); - } catch (SQLException e) { - e.printStackTrace(); - } + for (Map.Entry sitelink : sitelinks.entrySet()) { + try { + pstInsertSiteLink.setString(1, itemDocument.getEntityId().getId()); + pstInsertSiteLink.setString(2, sitelink.getValue().getSiteKey()); + pstInsertSiteLink.setString(3, sitelink.getValue().getPageTitle()); + // getBadges + pstInsertSiteLink.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } + } } + + private void extractClaims(ItemDocument itemDocument) { - private void extractSnaks(ItemDocument itemDocument) { - ArrayList snaks = new ArrayList<>(); - for (StatementGroup statementGroup : itemDocument.getStatementGroups()) { - String propertyId = statementGroup.getProperty().getId(); + String propertyId = statementGroup.getProperty().getId(); for (Statement statement : statementGroup.getStatements()) { if (statement.getClaim().getMainSnak() instanceof ValueSnak) { @@ -166,79 +190,79 @@ private void extractSnaks(ItemDocument itemDocument) { if (value instanceof GlobeCoordinatesValue) { GlobeCoordinatesValue coordinates = (GlobeCoordinatesValue)value; - this.insertCoordinates(itemDocument, coordinates); + insertCoordinates(itemDocument, propertyId, coordinates); + } else if (value instanceof TimeValue) { + TimeValue timeValue = (TimeValue)value; + insertDateTime(itemDocument, propertyId, timeValue); } else if (value instanceof EntityIdValue) { EntityIdValue entityIdValue = (EntityIdValue)value; - insertValueSnaks(itemDocument, propertyId, entityIdValue.getId()); -// snaks.add(this.buildEntityIdValueSnak(propertyId, entityIdValue)); + insertEntity(itemDocument, propertyId, entityIdValue.getId()); } else if (value instanceof StringValue) { StringValue stringValue = (StringValue)value; - 
insertValueSnaks(itemDocument, propertyId, stringValue.getString()); -// snaks.add(this.buildValueSnak(propertyId, stringValue.getString())); + insertString(itemDocument, propertyId, stringValue.getString()); } } } } - - //insertValueSnaks(itemDocument, snaks); } - private void insertCoordinates(ItemDocument itemDocument, GlobeCoordinatesValue value) { + private void insertCoordinates(ItemDocument itemDocument, String propertyId, GlobeCoordinatesValue value) { try { - pstInsertCoordinates.setString(1, itemDocument.getEntityId().getId()); - pstInsertCoordinates.setString(2, value.getGlobe()); - pstInsertCoordinates.setDouble(3, value.getPrecision()); - pstInsertCoordinates.setDouble(4, value.getLatitude()); - pstInsertCoordinates.setDouble(5, value.getLongitude()); - pstInsertCoordinates.addBatch(); + pstInsertClauseCoordinates.setString(1, itemDocument.getEntityId().getId()); + pstInsertClauseCoordinates.setString(2, propertyId); + pstInsertClauseCoordinates.setString(3, value.getGlobe()); + pstInsertClauseCoordinates.setDouble(4, value.getPrecision()); + pstInsertClauseCoordinates.setDouble(5, value.getLatitude()); + pstInsertClauseCoordinates.setDouble(6, value.getLongitude()); + pstInsertClauseCoordinates.addBatch(); } catch (SQLException e) { e.printStackTrace(); } } - private String buildEntityIdValueSnak(String propertyId, EntityIdValue value) { - return this.buildValueSnak( - propertyId, - value.getId() - ); - } - - private String buildValueSnak(String propertyId, String value) { - final StringBuilder builder = new StringBuilder(); - builder.append(propertyId); - builder.append(HSTORE_SEPARATOR_TOKEN); - builder.append("\""); - builder.append(value); - builder.append("\""); - - return builder.toString(); + private void insertDateTime(ItemDocument itemDocument, String propertyId, TimeValue value) { + try { + pstInsertClauseDateTime.setString(1, itemDocument.getEntityId().getId()); + pstInsertClauseDateTime.setString(2, propertyId); + 
pstInsertClauseDateTime.setString(3, value.getPreferredCalendarModel()); + pstInsertClauseDateTime.setDouble(4, value.getYear()); + pstInsertClauseDateTime.setDouble(5, value.getMonth()); + pstInsertClauseDateTime.setDouble(6, value.getDay()); + pstInsertClauseDateTime.setDouble(7, value.getHour()); + pstInsertClauseDateTime.setDouble(8, value.getMinute()); + pstInsertClauseDateTime.setDouble(9, value.getSecond()); + pstInsertClauseDateTime.setDouble(10, value.getPrecision()); + pstInsertClauseDateTime.setDouble(11, value.getBeforeTolerance()); + pstInsertClauseDateTime.setDouble(12, value.getAfterTolerance()); + pstInsertClauseDateTime.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } } - private void insertValueSnaks(ItemDocument itemDocument, String propertyId, String value) { - + private void insertString(ItemDocument itemDocument, String propertyId, String value) { + try { - pstInsertValue.setString(1, itemDocument.getEntityId().getId()); - pstInsertValue.setString(2, propertyId); - pstInsertValue.setString(3, value); - pstInsertValue.addBatch(); + pstInsertClauseString.setString(1, itemDocument.getEntityId().getId()); + pstInsertClauseString.setString(2, propertyId); + pstInsertClauseString.setString(3, value); + pstInsertClauseString.addBatch(); } catch (SQLException e) { e.printStackTrace(); } } - private String buildSnaksString(ArrayList snaks) { - String snakString = ""; + private void insertEntity(ItemDocument itemDocument, String propertyId, String value) { - for (int i = 0; i < snaks.size() - 1; i++) { - if ( i == 0 ) { - snakString = snakString + snaks.get(i); - } else { - snakString = snakString + ", " + snaks.get(i); - } + try { + pstInsertClauseEntity.setString(1, itemDocument.getEntityId().getId()); + pstInsertClauseEntity.setString(2, propertyId); + pstInsertClauseEntity.setString(3, value); + pstInsertClauseEntity.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); } - - return snakString; } public void 
processPropertyDocument(PropertyDocument arg0) { From 9a0efb07b3741b6c28fcb291a154cbf9fc91dc37 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Thu, 16 Jul 2015 00:46:58 +0100 Subject: [PATCH 09/11] Extend to import properties as well as items --- .../filbertkm/importer/JsonDumpProcessor.java | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index 7da70ae..94c6b85 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -3,7 +3,6 @@ import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; -import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -19,8 +18,10 @@ import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument; import org.wikidata.wdtk.datamodel.interfaces.SiteLink; import org.wikidata.wdtk.datamodel.interfaces.Statement; +import org.wikidata.wdtk.datamodel.interfaces.StatementDocument; import org.wikidata.wdtk.datamodel.interfaces.StatementGroup; import org.wikidata.wdtk.datamodel.interfaces.StringValue; +import org.wikidata.wdtk.datamodel.interfaces.TermedDocument; import org.wikidata.wdtk.datamodel.interfaces.TimeValue; import org.wikidata.wdtk.datamodel.interfaces.Value; import org.wikidata.wdtk.datamodel.interfaces.ValueSnak; @@ -98,6 +99,18 @@ public void processItemDocument(ItemDocument itemDocument) { } } + public void processPropertyDocument(PropertyDocument propertyDoc) { + extractLabels(propertyDoc); + extractAliases(propertyDoc); + extractDescriptions(propertyDoc); + extractClaims(propertyDoc); + + documentCount++; + if (documentCount > documentBatchSize) { + flush(); + } + } + public void flush() { try { pstInsertLabel.executeBatch(); @@ -113,7 +126,7 @@ public void flush() { } } - private void extractLabels(ItemDocument itemDocument) { + 
private void extractLabels(TermedDocument itemDocument) { Map labels = itemDocument.getLabels(); for (Map.Entry label : labels.entrySet()) { @@ -128,7 +141,7 @@ private void extractLabels(ItemDocument itemDocument) { } } - private void extractAliases(ItemDocument itemDocument) { + private void extractAliases(TermedDocument itemDocument) { Map> aliases = itemDocument.getAliases(); for (Map.Entry> aliasMap : aliases.entrySet()) { @@ -148,7 +161,7 @@ private void extractAliases(ItemDocument itemDocument) { } } - private void extractDescriptions(ItemDocument itemDocument) { + private void extractDescriptions(TermedDocument itemDocument) { Map descriptions = itemDocument.getDescriptions(); for (Map.Entry description : descriptions.entrySet()) { @@ -179,7 +192,7 @@ private void extractSiteLinks(ItemDocument itemDocument) { } } - private void extractClaims(ItemDocument itemDocument) { + private void extractClaims(StatementDocument itemDocument) { for (StatementGroup statementGroup : itemDocument.getStatementGroups()) { String propertyId = statementGroup.getProperty().getId(); @@ -206,7 +219,7 @@ private void extractClaims(ItemDocument itemDocument) { } } - private void insertCoordinates(ItemDocument itemDocument, String propertyId, GlobeCoordinatesValue value) { + private void insertCoordinates(StatementDocument itemDocument, String propertyId, GlobeCoordinatesValue value) { try { pstInsertClauseCoordinates.setString(1, itemDocument.getEntityId().getId()); @@ -221,7 +234,7 @@ private void insertCoordinates(ItemDocument itemDocument, String propertyId, Glo } } - private void insertDateTime(ItemDocument itemDocument, String propertyId, TimeValue value) { + private void insertDateTime(StatementDocument itemDocument, String propertyId, TimeValue value) { try { pstInsertClauseDateTime.setString(1, itemDocument.getEntityId().getId()); pstInsertClauseDateTime.setString(2, propertyId); @@ -241,7 +254,7 @@ private void insertDateTime(ItemDocument itemDocument, String propertyId, 
TimeVa } } - private void insertString(ItemDocument itemDocument, String propertyId, String value) { + private void insertString(StatementDocument itemDocument, String propertyId, String value) { try { pstInsertClauseString.setString(1, itemDocument.getEntityId().getId()); @@ -253,7 +266,7 @@ private void insertString(ItemDocument itemDocument, String propertyId, String v } } - private void insertEntity(ItemDocument itemDocument, String propertyId, String value) { + private void insertEntity(StatementDocument itemDocument, String propertyId, String value) { try { pstInsertClauseEntity.setString(1, itemDocument.getEntityId().getId()); @@ -265,11 +278,6 @@ private void insertEntity(ItemDocument itemDocument, String propertyId, String v } } - public void processPropertyDocument(PropertyDocument arg0) { - // TODO Auto-generated method stub - - } - public static void configureLogging() { // Create the appender that will write log messages to the console. ConsoleAppender consoleAppender = new ConsoleAppender(); From 3fd61e49a82f8ebd2fedd6b12fdc5ab5026b7468 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Thu, 16 Jul 2015 00:57:44 +0100 Subject: [PATCH 10/11] Add indexes after data import --- sql/index.sql | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 sql/index.sql diff --git a/sql/index.sql b/sql/index.sql new file mode 100644 index 0000000..6b94e53 --- /dev/null +++ b/sql/index.sql @@ -0,0 +1,8 @@ +create index idx_label on label using btree (entity_id); +create index idx_alias on alias using btree (entity_id); +create index idx_description on description using btree (entity_id); +create index idx_claim_coordinate on claim_coordinate using btree (entity_id); +create index idx_claim_datetime on claim_datetime using btree (entity_id); +create index idx_claim_entity on claim_entity using btree (entity_id); +create index idx_claim_string on claim_string using btree (entity_id); +create index idx_sitelink on sitelink using btree (entity_id); From 
5f28a6f2de136d400dd8e978824bb77ddda89417 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Thu, 16 Jul 2015 19:55:46 +0100 Subject: [PATCH 11/11] Cleanup after PropertyDocument changes --- .../com/filbertkm/importer/JsonDumpProcessor.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java index 94c6b85..8de21e2 100644 --- a/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java +++ b/src/main/java/com/filbertkm/importer/JsonDumpProcessor.java @@ -93,22 +93,26 @@ public void processItemDocument(ItemDocument itemDocument) { extractSiteLinks(itemDocument); extractClaims(itemDocument); - documentCount++; - if (documentCount > documentBatchSize) { - flush(); - } + flushBatch(); } public void processPropertyDocument(PropertyDocument propertyDoc) { + String itemId = propertyDoc.getEntityId().getId(); + logger.info("Processing: " + itemId); + extractLabels(propertyDoc); extractAliases(propertyDoc); extractDescriptions(propertyDoc); extractClaims(propertyDoc); + flushBatch(); + } + + private void flushBatch() { documentCount++; if (documentCount > documentBatchSize) { flush(); - } + } } public void flush() {