From fd1f4bf370680ea11f4058be725a2b35e7f42fed Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Thu, 28 May 2015 17:08:48 -0600 Subject: [PATCH 1/9] Whitespace cleanup... --- .../corpus/data/HTMLStatusExtractor.java | 256 +++++++++--------- 1 file changed, 128 insertions(+), 128 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java index 5c7bf81..0823eae 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java @@ -32,139 +32,139 @@ public class HTMLStatusExtractor { - public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); - - public HTMLStatusExtractor() { - date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); + public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); + + public HTMLStatusExtractor() { + date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + public static Map splitQuery(URL url) + throws java.io.UnsupportedEncodingException { + Map query_pairs = new LinkedHashMap(); + String query = url.getQuery(); + String[] pairs = query.split("&"); + for (String pair : pairs) { + int idx = pair.indexOf("="); + query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), + URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); + } + return query_pairs; + } + + public JsonObject extractTweet(String html) + throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { + JsonObject status = new JsonObject(); + + Document doc = Jsoup.parse(html); + Element tweet_div = doc.select("div.permalink-tweet").first(); + + String tweet_text = tweet_div.select("p.tweet-text").first().text(); + status.addProperty("text", tweet_text); + + String tweet_id = tweet_div.attr("data-tweet-id"); + status.addProperty("id_str", tweet_id); + status.addProperty("id", Long.parseLong(tweet_id)); + + String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); + Date created_at = new Date(); + created_at.setTime(Long.parseLong(timestamp) * 1000); + status.addProperty("created_at", date_fmt.format(created_at)); + + Elements js_stats_retweets = doc.select("li.js-stat-retweets"); + if (!js_stats_retweets.isEmpty()) { + status.addProperty("retweeted", true); + String count = js_stats_retweets.select("strong").first().text(); + status.addProperty("retweet_count", Long.parseLong(count)); + } else { + status.addProperty("retweeted", false); + status.addProperty("retweet_count", 0); } + Elements js_stats_favs = doc.select("li.js-stat-favorites"); + status.addProperty("favorited", !js_stats_favs.isEmpty()); + + + // User subfield + JsonObject user = new JsonObject(); + String user_id = tweet_div.attr("data-user-id"); + user.addProperty("id_str", user_id); + user.addProperty("id", Long.parseLong(user_id)); + String screen_name = tweet_div.attr("data-screen-name"); + user.addProperty("screen_name", screen_name); + String user_name = tweet_div.attr("data-name"); + user.addProperty("name", user_name); + + status.add("user", user); + + // Geo information + Elements tweet_loc = doc.select("a.tweet-geo-text"); + if (!tweet_loc.isEmpty()) { + JsonObject location = new JsonObject(); + Element loc = tweet_loc.first(); + // Adding http to avoid malformed URL exception + URL url = new URL("http:" + loc.attr("href")); + Map query_params = HTMLStatusExtractor.splitQuery(url); + // Loop over possible query parameters + // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html + String lat_and_long = null; + if ((lat_and_long = query_params.get("ll")) != null + || (lat_and_long = query_params.get("sll")) != null + || (lat_and_long = query_params.get("cbll")) != null + || (lat_and_long = query_params.get("q")) != null) { + String[] coordinates = lat_and_long.split(","); + double latitude = Double.parseDouble(coordinates[0]); + double longitude = Double.parseDouble(coordinates[1]); + location.addProperty("latitude", latitude); + location.addProperty("longitude", longitude); + } + location.addProperty("location_text", loc.text()); + status.add("location", location); + } + + return status; + } - public static Map splitQuery(URL url) - throws java.io.UnsupportedEncodingException { - Map query_pairs = new LinkedHashMap(); - String query = url.getQuery(); - String[] pairs = query.split("&"); - for (String pair : pairs) { - int idx = pair.indexOf("="); - query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), - URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); - } - return query_pairs; + private static final String HTML_OPTION = "html"; + + @SuppressWarnings("static-access") + public static void main(String[] args) throws Exception { + Options options = new Options(); + options.addOption(OptionBuilder.withArgName("path").hasArg() + .withDescription("HTML file from twitter.com").create(HTML_OPTION)); + + CommandLine cmdline = null; + CommandLineParser parser = new GnuParser(); + try { + cmdline = parser.parse(options, args); + } catch (ParseException exp) { + System.err.println("Error parsing command line: " + exp.getMessage()); + System.exit(-1); } - public JsonObject extractTweet(String html) - throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { - JsonObject status = new JsonObject(); - - Document doc = Jsoup.parse(html); - Element tweet_div = doc.select("div.permalink-tweet").first(); - - String tweet_text = tweet_div.select("p.tweet-text").first().text(); - status.addProperty("text", tweet_text); - - String tweet_id = tweet_div.attr("data-tweet-id"); - status.addProperty("id_str", tweet_id); - status.addProperty("id", Long.parseLong(tweet_id)); - - String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); - Date created_at = new Date(); - created_at.setTime(Long.parseLong(timestamp) * 1000); - status.addProperty("created_at", date_fmt.format(created_at)); - - Elements js_stats_retweets = doc.select("li.js-stat-retweets"); - if (!js_stats_retweets.isEmpty()) { - status.addProperty("retweeted", true); - String count = js_stats_retweets.select("strong").first().text(); - status.addProperty("retweet_count", Long.parseLong(count)); - } else { - status.addProperty("retweeted", false); - status.addProperty("retweet_count", 0); - } - Elements js_stats_favs = doc.select("li.js-stat-favorites"); - status.addProperty("favorited", !js_stats_favs.isEmpty()); - - - // User subfield - JsonObject user = new JsonObject(); - String user_id = tweet_div.attr("data-user-id"); - user.addProperty("id_str", user_id); - user.addProperty("id", Long.parseLong(user_id)); - String screen_name = tweet_div.attr("data-screen-name"); - user.addProperty("screen_name", screen_name); - String user_name = tweet_div.attr("data-name"); - user.addProperty("name", user_name); - - status.add("user", user); - - // Geo information - Elements tweet_loc = doc.select("a.tweet-geo-text"); - if (!tweet_loc.isEmpty()) { - JsonObject location = new JsonObject(); - Element loc = tweet_loc.first(); - // Adding http to avoid malformed URL exception - URL url = new URL("http:" + loc.attr("href")); - Map query_params = HTMLStatusExtractor.splitQuery(url); - // Loop over possible query parameters - // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html - String lat_and_long = null; - if ((lat_and_long = query_params.get("ll")) != null - || (lat_and_long = query_params.get("sll")) != null - || (lat_and_long = query_params.get("cbll")) != null - || (lat_and_long = query_params.get("q")) != null) { - String[] coordinates = lat_and_long.split(","); - double latitude = Double.parseDouble(coordinates[0]); - double longitude = Double.parseDouble(coordinates[1]); - location.addProperty("latitude", latitude); - location.addProperty("longitude", longitude); - } - location.addProperty("location_text", loc.text()); - status.add("location", location); - } - - return status; + if (!cmdline.hasOption(HTML_OPTION)) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(HTMLStatusExtractor.class.getName(), options); + System.exit(-1); } - private static final String HTML_OPTION = "html"; - - @SuppressWarnings("static-access") - public static void main(String[] args) throws Exception { - Options options = new Options(); - options.addOption(OptionBuilder.withArgName("path").hasArg() - .withDescription("HTML file from twitter.com").create(HTML_OPTION)); - - CommandLine cmdline = null; - CommandLineParser parser = new GnuParser(); - try { - cmdline = parser.parse(options, args); - } catch (ParseException exp) { - System.err.println("Error parsing command line: " + exp.getMessage()); - System.exit(-1); - } - - if (!cmdline.hasOption(HTML_OPTION)) { - HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp(HTMLStatusExtractor.class.getName(), options); - System.exit(-1); - } - - String html_filename = cmdline.getOptionValue(HTML_OPTION); - BufferedReader html_file = null; - StringBuffer buf = new StringBuffer(); - try { - html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); - String line; - while ((line = html_file.readLine()) != null) { - buf.append(line); - buf.append('\n'); - } - } catch (IOException e) { - e.printStackTrace(); - } finally { - html_file.close(); - } - - HTMLStatusExtractor hse = new HTMLStatusExtractor(); - JsonObject json = hse.extractTweet(buf.toString()); - Gson gson = new GsonBuilder().setPrettyPrinting().create(); - System.out.println(gson.toJson(json)); + String html_filename = cmdline.getOptionValue(HTML_OPTION); + BufferedReader html_file = null; + StringBuffer buf = new StringBuffer(); + try { + html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); + String line; + while ((line = html_file.readLine()) != null) { + buf.append(line); + buf.append('\n'); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + html_file.close(); } + + HTMLStatusExtractor hse = new HTMLStatusExtractor(); + JsonObject json = hse.extractTweet(buf.toString()); + Gson gson = new GsonBuilder().setPrettyPrinting().create(); + System.out.println(gson.toJson(json)); + } } From 07b3b6433509c3afa8f3c87554f761632de4ba95 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Thu, 28 May 2015 17:10:59 -0600 Subject: [PATCH 2/9] Add a CBOR output format to the crawler. --- .../download/AsyncHTMLStatusBlockCrawler.java | 37 ++++++++++++++----- .../cc/twittertools/download/CBOROutput.java | 32 ++++++++++++++++ .../download/CrawlerOutputWriter.java | 8 ++++ .../twittertools/download/GZipJSONOutput.java | 33 +++++++++++++++++ 4 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java create mode 100644 twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java create mode 100644 twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java index da36389..430115f 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java @@ -67,6 +67,11 @@ public class AsyncHTMLStatusBlockCrawler { private static final int WAIT_BEFORE_RETRY = 1000; private static final Timer timer = new Timer(true); + enum Format { + JSON, + CBOR + } + private static final JsonParser JSON_PARSER = new JsonParser(); private static final Gson GSON = new Gson(); @@ -75,6 +80,7 @@ public class AsyncHTMLStatusBlockCrawler { private final File repair; private final AsyncHttpClient asyncHttpClient; private final boolean noFollow; + private final Format output_format; // key = statud id, value = tweet JSON private final ConcurrentSkipListMap crawl = new ConcurrentSkipListMap(); @@ -85,9 +91,10 @@ public class AsyncHTMLStatusBlockCrawler { private final AtomicInteger connections = new AtomicInteger(0); public AsyncHTMLStatusBlockCrawler(File file, String output, String repair, - boolean noFollow) throws IOException { + boolean noFollow, Format output_format) throws IOException { this.file = Preconditions.checkNotNull(file); this.noFollow = noFollow; + this.output_format = output_format; if (!file.exists()) { throw new IOException(file + " does not exist!"); @@ -125,7 +132,7 @@ public static String getUrl(long id, String username) { return String.format("http://twitter.com/%s/status/%d", username, id); } - public void fetch() throws IOException { + public void fetch() throws Exception { long start = System.currentTimeMillis(); LOG.info("Processing " + file); @@ -182,11 +189,16 @@ public void fetch() throws IOException { LOG.info("Writing tweets..."); int written = 0; - OutputStreamWriter out = new OutputStreamWriter(new GZIPOutputStream( - new FileOutputStream(output)), "UTF-8"); + CrawlerOutputWriter out; + if (output_format == Format.CBOR) { + out = new CBOROutput(output); + } else { + out = new GZipJSONOutput(output); + } + out.open(); for (Map.Entry entry : crawl.entrySet()) { written++; - out.write(entry.getValue() + "\n"); + out.write(entry.getValue()); } out.close(); @@ -195,12 +207,12 @@ public void fetch() throws IOException { if (this.repair != null) { LOG.info("Writing repair data file..."); written = 0; - out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8"); + OutputStreamWriter repair_out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8"); for (Map.Entry entry : crawl_repair.entrySet()) { written++; - out.write(entry.getValue() + "\n"); + repair_out.write(entry.getValue() + "\n"); } - out.close(); + repair_out.close(); LOG.info(written + " statuses need repair."); } @@ -375,6 +387,7 @@ private void crawlURL(String url, TweetFetcherHandler handler) { private static final String OUTPUT_OPTION = "output"; private static final String REPAIR_OPTION = "repair"; private static final String NOFOLLOW_OPTION = "noFollow"; + private static final String FORMAT_OPTION = "cbor"; @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { @@ -387,6 +400,7 @@ public static void main(String[] args) throws Exception { .withDescription("output repair file (can be used later as a data file)") .create(REPAIR_OPTION)); options.addOption(NOFOLLOW_OPTION, NOFOLLOW_OPTION, false, "don't follow 301 redirects"); + options.addOption(FORMAT_OPTION, FORMAT_OPTION, false, "use CBOR format"); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); @@ -407,6 +421,11 @@ public static void main(String[] args) throws Exception { String output = cmdline.getOptionValue(OUTPUT_OPTION); String repair = cmdline.getOptionValue(REPAIR_OPTION); boolean noFollow = cmdline.hasOption(NOFOLLOW_OPTION); - new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow).fetch(); + boolean cbor = cmdline.hasOption(FORMAT_OPTION); + Format output_format = Format.JSON; + if (cbor) { + output_format = Format.CBOR; + } + new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow, output_format).fetch(); } } diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java new file mode 100644 index 0000000..39e7c07 --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -0,0 +1,32 @@ +package cc.twittertools.download; + +import java.io.File; +import java.util.Map; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.cbor.CBORFactory; + +public class CBOROutput extends GZipJSONOutput { + + CBORFactory fac; + ObjectMapper mapper; + + public CBOROutput(File output_file) { + super(output_file); + fac = new CBORFactory(); + mapper = new ObjectMapper(fac); + } + + @SuppressWarnings("unchecked") + public void write(String s) throws Exception { + Map obj; + try { + obj = mapper.readValue(s, Map.class); + } catch (Exception e) { + e.printStackTrace(); + return; + } + mapper.writeValue(out, obj); + } + +} diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java new file mode 100644 index 0000000..ebc119c --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java @@ -0,0 +1,8 @@ +package cc.twittertools.download; + +public interface CrawlerOutputWriter { + + void open() throws Exception; + void write(String s) throws Exception; + void close() throws Exception; +} diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java new file mode 100644 index 0000000..373ed4c --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java @@ -0,0 +1,33 @@ +package cc.twittertools.download; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; +import java.util.zip.GZIPOutputStream; + +public class GZipJSONOutput implements CrawlerOutputWriter { + + protected File output_file; + protected OutputStreamWriter out; + + public GZipJSONOutput(File output_file) { + this.output_file = output_file; + } + + @Override + public void open() throws Exception { + out = new OutputStreamWriter(new GZIPOutputStream( + new FileOutputStream(output_file)), "UTF-8"); + } + + @Override + public void write(String s) throws Exception { + out.write(s + "\n"); + } + + @Override + public void close() throws Exception { + out.close(); + } + +} From 300f052c4f87477748a16914b5613f5aba048ec6 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Mon, 1 Jun 2015 09:33:58 -0600 Subject: [PATCH 3/9] Fixed several bugs: read the JSON object with a JSON parser, not a CBOR parser; CBOR needs a byte target, so we need our own open/close; remind Jackson not to close the stream for us. Confirmed working on a set of 20 tweet ids, reading the output with other CBOR tools. --- .../cc/twittertools/download/CBOROutput.java | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java index 39e7c07..82a9409 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -1,32 +1,54 @@ package cc.twittertools.download; import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.util.Map; +import java.util.zip.GZIPOutputStream; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.cbor.CBORFactory; public class CBOROutput extends GZipJSONOutput { - CBORFactory fac; - ObjectMapper mapper; + CBORFactory cbor_fac; + ObjectMapper cbor_mapper; + JsonFactory json_fac; + ObjectMapper json_mapper; + OutputStream os; public CBOROutput(File output_file) { super(output_file); - fac = new CBORFactory(); - mapper = new ObjectMapper(fac); + cbor_fac = new CBORFactory(); + cbor_fac.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET); + cbor_mapper = new ObjectMapper(cbor_fac); + json_fac = new JsonFactory(); + json_mapper = new ObjectMapper(json_fac); } - @SuppressWarnings("unchecked") + @Override + public void open() throws Exception { + os = new GZIPOutputStream(new FileOutputStream(output_file)); + } + + @Override @SuppressWarnings("unchecked") public void write(String s) throws Exception { Map obj; try { - obj = mapper.readValue(s, Map.class); + obj = json_mapper.readValue(s, Map.class); } catch (Exception e) { e.printStackTrace(); return; } - mapper.writeValue(out, obj); + cbor_mapper.writeValue(os, obj); } + @Override + public void close() throws Exception { + os.close(); + } } From 709bd1d1975295fb5a73a5177130b56fa6741433 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:12:17 -0400 Subject: [PATCH 4/9] CBOR output following the CCA schema and confirmed readable. --- .../cc/twittertools/download/CBOROutput.java | 52 ++++++++++++++++--- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java index 82a9409..c0018c4 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -5,18 +5,27 @@ import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.util.LinkedHashMap; import java.util.Map; import java.util.zip.GZIPOutputStream; +import org.apache.commons.codec.digest.DigestUtils; + import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.dataformat.cbor.CBORFactory; +import com.fasterxml.jackson.dataformat.cbor.CBORGenerator; public class CBOROutput extends GZipJSONOutput { CBORFactory cbor_fac; ObjectMapper cbor_mapper; + CBORGenerator cbor_gen; JsonFactory json_fac; ObjectMapper json_mapper; OutputStream os; @@ -33,22 +42,49 @@ public CBOROutput(File output_file) { @Override public void open() throws Exception { os = new GZIPOutputStream(new FileOutputStream(output_file)); + cbor_gen = cbor_fac.createGenerator(os); } @Override @SuppressWarnings("unchecked") public void write(String s) throws Exception { - Map obj; - try { - obj = json_mapper.readValue(s, Map.class); - } catch (Exception e) { - e.printStackTrace(); - return; - } - cbor_mapper.writeValue(os, obj); + ObjectNode cca_obj = wrap_cca(s); + cbor_mapper.writeTree(cbor_gen, cca_obj); } @Override public void close() throws Exception { os.close(); } + + @SuppressWarnings("unchecked") + public ObjectNode wrap_cca(String tweet) throws JsonProcessingException, IOException { + final JsonNodeFactory jf = JsonNodeFactory.instance; + + ObjectNode twobj = (ObjectNode) json_mapper.readTree(tweet); + + ObjectNode cca_obj = jf.objectNode(); + long id = Long.parseLong(twobj.get("user").get("id").toString()); + String username = twobj.get("user").get("screen_name").toString(); + String url = AsyncHTMLStatusBlockCrawler.getUrl(id, username); + cca_obj.put("url", url); + cca_obj.put("timestamp", System.currentTimeMillis()); + cca_obj.putNull("request"); + + ObjectNode response = cca_obj.putObject("response"); + response.put("status", "200"); + response.put("server", "twitter.com"); + response.putArray("headers").add(jf.arrayNode().add("Content-Type").add("application/json")); + response.put("body", tweet); + + cca_obj.put("key", "com_twitter_" + DigestUtils.sha1Hex(url)); + + ArrayNode indices = cca_obj.arrayNode(); + indices.addObject().put("crawl", "tweets"); + cca_obj.set("indices", indices); + + ArrayNode features = cca_obj.arrayNode(); + features.addObject().replace("json", twobj); + cca_obj.set("features", features); + return cca_obj; + } } From 4e1fd5d54bf446fa3e0a155ab9e2c046efb1680c Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:22:17 -0400 Subject: [PATCH 5/9] Converted HTMLStatusExtractor to Jackson, since we have to use it for CBOR work elsewhere. --- .../corpus/data/HTMLStatusExtractor.java | 78 +++++++++---------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java index 0823eae..f4dd566 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java @@ -1,27 +1,17 @@ package cc.twittertools.corpus.data; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.Map; import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.net.URL; import java.net.URLDecoder; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.LinkedHashMap; +import java.util.Map; import java.util.TimeZone; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.JsonObject; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -29,13 +19,22 @@ import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; public class HTMLStatusExtractor { public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); + public JsonNodeFactory jfac; - public HTMLStatusExtractor() { + public HTMLStatusExtractor(JsonNodeFactory jfac) { date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); + this.jfac = jfac; } public static Map splitQuery(URL url) @@ -51,54 +50,52 @@ public static Map splitQuery(URL url) return query_pairs; } - public JsonObject extractTweet(String html) + public ObjectNode extractTweet(String html) throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { - JsonObject status = new JsonObject(); + ObjectNode status = jfac.objectNode(); Document doc = Jsoup.parse(html); Element tweet_div = doc.select("div.permalink-tweet").first(); String tweet_text = tweet_div.select("p.tweet-text").first().text(); - status.addProperty("text", tweet_text); + status.put("text", tweet_text); String tweet_id = tweet_div.attr("data-tweet-id"); - status.addProperty("id_str", tweet_id); - status.addProperty("id", Long.parseLong(tweet_id)); + status.put("id_str", tweet_id); + status.put("id", Long.parseLong(tweet_id)); String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); Date created_at = new Date(); created_at.setTime(Long.parseLong(timestamp) * 1000); - status.addProperty("created_at", date_fmt.format(created_at)); + status.put("created_at", date_fmt.format(created_at)); Elements js_stats_retweets = doc.select("li.js-stat-retweets"); if (!js_stats_retweets.isEmpty()) { - status.addProperty("retweeted", true); + status.put("retweeted", true); String count = js_stats_retweets.select("strong").first().text(); - status.addProperty("retweet_count", Long.parseLong(count)); + status.put("retweet_count", Long.parseLong(count)); } else { - status.addProperty("retweeted", false); - status.addProperty("retweet_count", 0); + status.put("retweeted", false); + status.put("retweet_count", 0); } Elements js_stats_favs = doc.select("li.js-stat-favorites"); - status.addProperty("favorited", !js_stats_favs.isEmpty()); + status.put("favorited", !js_stats_favs.isEmpty()); // User subfield - JsonObject user = new JsonObject(); + ObjectNode user = status.putObject("user"); String user_id = tweet_div.attr("data-user-id"); - user.addProperty("id_str", user_id); - user.addProperty("id", Long.parseLong(user_id)); + user.put("id_str", user_id); + user.put("id", Long.parseLong(user_id)); String screen_name = tweet_div.attr("data-screen-name"); - user.addProperty("screen_name", screen_name); + user.put("screen_name", screen_name); String user_name = tweet_div.attr("data-name"); - user.addProperty("name", user_name); - - status.add("user", user); + user.put("name", user_name); // Geo information Elements tweet_loc = doc.select("a.tweet-geo-text"); if (!tweet_loc.isEmpty()) { - JsonObject location = new JsonObject(); + ObjectNode location = status.putObject("location"); Element loc = tweet_loc.first(); // Adding http to avoid malformed URL exception URL url = new URL("http:" + loc.attr("href")); @@ -113,11 +110,10 @@ public JsonObject extractTweet(String html) String[] coordinates = lat_and_long.split(","); double latitude = Double.parseDouble(coordinates[0]); double longitude = Double.parseDouble(coordinates[1]); - location.addProperty("latitude", latitude); - location.addProperty("longitude", longitude); + location.put("latitude", latitude); + location.put("longitude", longitude); } - location.addProperty("location_text", loc.text()); - status.add("location", location); + location.put("location_text", loc.text()); } return status; @@ -162,9 +158,9 @@ public static void main(String[] args) throws Exception { html_file.close(); } - HTMLStatusExtractor hse = new HTMLStatusExtractor(); - JsonObject json = hse.extractTweet(buf.toString()); - Gson gson = new GsonBuilder().setPrettyPrinting().create(); - System.out.println(gson.toJson(json)); + JsonNodeFactory fac = JsonNodeFactory.instance; + HTMLStatusExtractor hse = new HTMLStatusExtractor(fac); + ObjectNode json = hse.extractTweet(buf.toString()); + System.out.println(json); } } From 67775e90d855502d6ba225d48d0334d1b88bc0d2 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:32:31 -0400 Subject: [PATCH 6/9] Pushed JSON extraction down into output format, so the CBOR format can output both the HTML and the JSON for completeness. --- .../corpus/data/HTMLStatusExtractor.java | 7 +++---- .../download/AsyncHTMLStatusBlockCrawler.java | 8 ++++---- .../cc/twittertools/download/CBOROutput.java | 17 ++++++++--------- .../twittertools/download/GZipJSONOutput.java | 9 ++++++++- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java index f4dd566..2ed3bd2 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java @@ -32,9 +32,9 @@ public class HTMLStatusExtractor { public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); public JsonNodeFactory jfac; - public HTMLStatusExtractor(JsonNodeFactory jfac) { + public HTMLStatusExtractor() { date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); - this.jfac = jfac; + this.jfac = JsonNodeFactory.instance; } public static Map splitQuery(URL url) @@ -158,8 +158,7 @@ public static void main(String[] args) throws Exception { html_file.close(); } - JsonNodeFactory fac = JsonNodeFactory.instance; - HTMLStatusExtractor hse = new HTMLStatusExtractor(fac); + HTMLStatusExtractor hse = new HTMLStatusExtractor(); ObjectNode json = hse.extractTweet(buf.toString()); System.out.println(json); } diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java index 430115f..b9a0c13 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java @@ -230,7 +230,7 @@ private class TweetFetcherHandler extends AsyncCompletionHandler { private int httpStatus = -1; - private HTMLStatusExtractor extractor = new HTMLStatusExtractor(); + // private HTMLStatusExtractor extractor = new HTMLStatusExtractor(); public TweetFetcherHandler(long id, String username, String url, int numRetries, boolean followRedirects, String line) { @@ -310,12 +310,12 @@ public Response onCompleted(Response response) { try { String html = response.getResponseBody("UTF-8"); - JsonObject status = extractor.extractTweet(html); + // JsonObject status = extractor.extractTweet(html); // save the requested id - status.addProperty("requested_id", new Long(id)); + // status.addProperty("requested_id", new Long(id)); - crawl.put(id, GSON.toJson(status)); + crawl.put(id, html); connections.decrementAndGet(); return response; diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java index c0018c4..d33f039 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -46,8 +46,9 @@ public void open() throws Exception { } @Override @SuppressWarnings("unchecked") - public void write(String s) throws Exception { - ObjectNode cca_obj = wrap_cca(s); + public void write(String html) throws Exception { + ObjectNode json_tweet = ext.extractTweet(html); + ObjectNode cca_obj = wrap_cca(html, json_tweet); cbor_mapper.writeTree(cbor_gen, cca_obj); } @@ -57,14 +58,12 @@ public void close() throws Exception { } @SuppressWarnings("unchecked") - public ObjectNode wrap_cca(String tweet) throws JsonProcessingException, IOException { + public ObjectNode wrap_cca(String tweet, ObjectNode json) throws JsonProcessingException, IOException { final JsonNodeFactory jf = JsonNodeFactory.instance; - ObjectNode twobj = (ObjectNode) json_mapper.readTree(tweet); - ObjectNode cca_obj = jf.objectNode(); - long id = Long.parseLong(twobj.get("user").get("id").toString()); - String username = twobj.get("user").get("screen_name").toString(); + long id = Long.parseLong(json.get("user").get("id").toString()); + String username = json.get("user").get("screen_name").toString(); String url = AsyncHTMLStatusBlockCrawler.getUrl(id, username); cca_obj.put("url", url); cca_obj.put("timestamp", System.currentTimeMillis()); @@ -73,7 +72,7 @@ public ObjectNode wrap_cca(String tweet) throws JsonProcessingException, IOExcep ObjectNode response = cca_obj.putObject("response"); response.put("status", "200"); response.put("server", "twitter.com"); - response.putArray("headers").add(jf.arrayNode().add("Content-Type").add("application/json")); + response.putArray("headers").add(jf.arrayNode().add("Content-Type").add("text/html")); response.put("body", tweet); cca_obj.put("key", "com_twitter_" + DigestUtils.sha1Hex(url)); @@ -83,7 +82,7 @@ public ObjectNode wrap_cca(String tweet) throws JsonProcessingException, IOExcep cca_obj.set("indices", indices); ArrayNode features = cca_obj.arrayNode(); - features.addObject().replace("json", twobj); + features.addObject().replace("json", json); cca_obj.set("features", features); return cca_obj; } diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java index 373ed4c..b2e733e 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java @@ -5,13 +5,19 @@ import java.io.OutputStreamWriter; import java.util.zip.GZIPOutputStream; +import com.fasterxml.jackson.databind.node.ObjectNode; + +import cc.twittertools.corpus.data.HTMLStatusExtractor; + public class GZipJSONOutput implements CrawlerOutputWriter { protected File output_file; protected OutputStreamWriter out; + protected HTMLStatusExtractor ext; public GZipJSONOutput(File output_file) { this.output_file = output_file; + ext = new HTMLStatusExtractor(); } @Override @@ -22,7 +28,8 @@ public void open() throws Exception { @Override public void write(String s) throws Exception { - out.write(s + "\n"); + ObjectNode json = ext.extractTweet(s); + out.write(json.toString() + "\n"); } @Override From 940936c397ffd3d925f5b53c3b0f7f2c00be43aa Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:45:07 -0400 Subject: [PATCH 7/9] Fix URL entry --- .../src/main/java/cc/twittertools/download/CBOROutput.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java index d33f039..876f105 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -62,8 +62,8 @@ public ObjectNode wrap_cca(String tweet, ObjectNode json) throws JsonProcessingE final JsonNodeFactory jf = JsonNodeFactory.instance; ObjectNode cca_obj = jf.objectNode(); - long id = Long.parseLong(json.get("user").get("id").toString()); - String username = json.get("user").get("screen_name").toString(); + long id = json.get("id").asLong(); + String username = json.get("user").get("screen_name").textValue(); String url = AsyncHTMLStatusBlockCrawler.getUrl(id, username); cca_obj.put("url", url); cca_obj.put("timestamp", System.currentTimeMillis()); From b3dfd914f75e9aa0308aa13a8e01071725971cd5 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:46:34 -0400 Subject: [PATCH 8/9] Clean up imports and remove stray Gson stuff. --- .../download/AsyncHTMLStatusBlockCrawler.java | 16 ---------------- .../cc/twittertools/download/CBOROutput.java | 3 --- 2 files changed, 19 deletions(-) diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java index b9a0c13..e61ba5c 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java @@ -16,8 +16,6 @@ package cc.twittertools.download; -import cc.twittertools.corpus.data.HTMLStatusExtractor; - import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -30,8 +28,6 @@ import java.util.TimerTask; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicInteger; -import java.util.zip.GZIPOutputStream; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -39,14 +35,9 @@ import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; -import com.google.gson.Gson; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import com.google.gson.JsonSyntaxException; import com.ning.http.client.AsyncCompletionHandler; import com.ning.http.client.AsyncHttpClient; import com.ning.http.client.AsyncHttpClientConfig; @@ -72,9 +63,6 @@ enum Format { CBOR } - private static final JsonParser JSON_PARSER = new JsonParser(); - private static final Gson GSON = new Gson(); - private final File file; private final File output; private final File repair; @@ -323,10 +311,6 @@ public Response onCompleted(Response response) { LOG.warn("Error (" + e + "): " + url); retry(); return response; - } catch (JsonSyntaxException e) { - LOG.warn("Unable to parse embedded JSON: " + url); - retry(); - return response; } catch (NullPointerException e) { LOG.warn("Unexpected format for embedded JSON: " + url); retry(); diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java index 876f105..8143ccd 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -4,9 +4,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.util.LinkedHashMap; -import java.util.Map; import java.util.zip.GZIPOutputStream; import org.apache.commons.codec.digest.DigestUtils; From ba10021aa3c169f172142e74f4a42b0692007645 Mon Sep 17 00:00:00 2001 From: Ian Soboroff Date: Tue, 2 Jun 2015 10:49:03 -0400 Subject: [PATCH 9/9] Forgot the POM edits for Jackson, d'oh. --- twitter-tools-core/pom.xml | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/twitter-tools-core/pom.xml b/twitter-tools-core/pom.xml index 83b7436..05f21b2 100644 --- a/twitter-tools-core/pom.xml +++ b/twitter-tools-core/pom.xml @@ -48,7 +48,8 @@ UTF-8 UTF-8 - + 2.5.0 + @@ -198,6 +199,11 @@ commons-cli 1.2 + + commons-codec + commons-codec + 1.10 + commons-io commons-io @@ -265,6 +271,28 @@ jsoup 1.7.3 + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + com.fasterxml.jackson.dataformat + jackson-dataformat-cbor + 2.5.3 + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} +