diff --git a/twitter-tools-core/pom.xml b/twitter-tools-core/pom.xml index 83b7436..05f21b2 100644 --- a/twitter-tools-core/pom.xml +++ b/twitter-tools-core/pom.xml @@ -48,7 +48,8 @@ UTF-8 UTF-8 - + 2.5.0 + @@ -198,6 +199,11 @@ commons-cli 1.2 + + commons-codec + commons-codec + 1.10 + commons-io commons-io @@ -265,6 +271,28 @@ jsoup 1.7.3 + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + com.fasterxml.jackson.dataformat + jackson-dataformat-cbor + 2.5.3 + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + diff --git a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java index 5c7bf81..2ed3bd2 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java @@ -1,27 +1,17 @@ package cc.twittertools.corpus.data; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.Map; import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.net.URL; import java.net.URLDecoder; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.LinkedHashMap; +import java.util.Map; import java.util.TimeZone; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.JsonObject; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -29,142 +19,147 @@ import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; -public class HTMLStatusExtractor { +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; - public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); +public class HTMLStatusExtractor { - public HTMLStatusExtractor() { - date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); + public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy"); + public JsonNodeFactory jfac; + + public HTMLStatusExtractor() { + date_fmt.setTimeZone(TimeZone.getTimeZone("UTC")); + this.jfac = JsonNodeFactory.instance; + } + + public static Map splitQuery(URL url) + throws java.io.UnsupportedEncodingException { + Map query_pairs = new LinkedHashMap(); + String query = url.getQuery(); + String[] pairs = query.split("&"); + for (String pair : pairs) { + int idx = pair.indexOf("="); + query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), + URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); } + return query_pairs; + } + + public ObjectNode extractTweet(String html) + throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { + ObjectNode status = jfac.objectNode(); + + Document doc = Jsoup.parse(html); + Element tweet_div = doc.select("div.permalink-tweet").first(); + + String tweet_text = tweet_div.select("p.tweet-text").first().text(); + status.put("text", tweet_text); + + String tweet_id = tweet_div.attr("data-tweet-id"); + status.put("id_str", tweet_id); + status.put("id", Long.parseLong(tweet_id)); + + String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); + Date created_at = new Date(); + created_at.setTime(Long.parseLong(timestamp) * 1000); + status.put("created_at", date_fmt.format(created_at)); + + Elements js_stats_retweets = doc.select("li.js-stat-retweets"); + if (!js_stats_retweets.isEmpty()) { + status.put("retweeted", true); + String count = js_stats_retweets.select("strong").first().text(); + status.put("retweet_count", Long.parseLong(count)); + } else { + status.put("retweeted", false); + status.put("retweet_count", 0); + } + Elements js_stats_favs = doc.select("li.js-stat-favorites"); + status.put("favorited", !js_stats_favs.isEmpty()); + + + // User subfield + ObjectNode user = status.putObject("user"); + String user_id = tweet_div.attr("data-user-id"); + user.put("id_str", user_id); + user.put("id", Long.parseLong(user_id)); + String screen_name = tweet_div.attr("data-screen-name"); + user.put("screen_name", screen_name); + String user_name = tweet_div.attr("data-name"); + user.put("name", user_name); + + // Geo information + Elements tweet_loc = doc.select("a.tweet-geo-text"); + if (!tweet_loc.isEmpty()) { + ObjectNode location = status.putObject("location"); + Element loc = tweet_loc.first(); + // Adding http to avoid malformed URL exception + URL url = new URL("http:" + loc.attr("href")); + Map query_params = HTMLStatusExtractor.splitQuery(url); + // Loop over possible query parameters + // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html + String lat_and_long = null; + if ((lat_and_long = query_params.get("ll")) != null + || (lat_and_long = query_params.get("sll")) != null + || (lat_and_long = query_params.get("cbll")) != null + || (lat_and_long = query_params.get("q")) != null) { + String[] coordinates = lat_and_long.split(","); + double latitude = Double.parseDouble(coordinates[0]); + double longitude = Double.parseDouble(coordinates[1]); + location.put("latitude", latitude); + location.put("longitude", longitude); + } + location.put("location_text", loc.text()); + } + + return status; + } - public static Map splitQuery(URL url) - throws java.io.UnsupportedEncodingException { - Map query_pairs = new LinkedHashMap(); - String query = url.getQuery(); - String[] pairs = query.split("&"); - for (String pair : pairs) { - int idx = pair.indexOf("="); - query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"), - URLDecoder.decode(pair.substring(idx + 1), "UTF-8")); - } - return query_pairs; + private static final String HTML_OPTION = "html"; + + @SuppressWarnings("static-access") + public static void main(String[] args) throws Exception { + Options options = new Options(); + options.addOption(OptionBuilder.withArgName("path").hasArg() + .withDescription("HTML file from twitter.com").create(HTML_OPTION)); + + CommandLine cmdline = null; + CommandLineParser parser = new GnuParser(); + try { + cmdline = parser.parse(options, args); + } catch (ParseException exp) { + System.err.println("Error parsing command line: " + exp.getMessage()); + System.exit(-1); } - public JsonObject extractTweet(String html) - throws java.net.MalformedURLException, java.io.UnsupportedEncodingException { - JsonObject status = new JsonObject(); - - Document doc = Jsoup.parse(html); - Element tweet_div = doc.select("div.permalink-tweet").first(); - - String tweet_text = tweet_div.select("p.tweet-text").first().text(); - status.addProperty("text", tweet_text); - - String tweet_id = tweet_div.attr("data-tweet-id"); - status.addProperty("id_str", tweet_id); - status.addProperty("id", Long.parseLong(tweet_id)); - - String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time"); - Date created_at = new Date(); - created_at.setTime(Long.parseLong(timestamp) * 1000); - status.addProperty("created_at", date_fmt.format(created_at)); - - Elements js_stats_retweets = doc.select("li.js-stat-retweets"); - if (!js_stats_retweets.isEmpty()) { - status.addProperty("retweeted", true); - String count = js_stats_retweets.select("strong").first().text(); - status.addProperty("retweet_count", Long.parseLong(count)); - } else { - status.addProperty("retweeted", false); - status.addProperty("retweet_count", 0); - } - Elements js_stats_favs = doc.select("li.js-stat-favorites"); - status.addProperty("favorited", !js_stats_favs.isEmpty()); - - - // User subfield - JsonObject user = new JsonObject(); - String user_id = tweet_div.attr("data-user-id"); - user.addProperty("id_str", user_id); - user.addProperty("id", Long.parseLong(user_id)); - String screen_name = tweet_div.attr("data-screen-name"); - user.addProperty("screen_name", screen_name); - String user_name = tweet_div.attr("data-name"); - user.addProperty("name", user_name); - - status.add("user", user); - - // Geo information - Elements tweet_loc = doc.select("a.tweet-geo-text"); - if (!tweet_loc.isEmpty()) { - JsonObject location = new JsonObject(); - Element loc = tweet_loc.first(); - // Adding http to avoid malformed URL exception - URL url = new URL("http:" + loc.attr("href")); - Map query_params = HTMLStatusExtractor.splitQuery(url); - // Loop over possible query parameters - // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html - String lat_and_long = null; - if ((lat_and_long = query_params.get("ll")) != null - || (lat_and_long = query_params.get("sll")) != null - || (lat_and_long = query_params.get("cbll")) != null - || (lat_and_long = query_params.get("q")) != null) { - String[] coordinates = lat_and_long.split(","); - double latitude = Double.parseDouble(coordinates[0]); - double longitude = Double.parseDouble(coordinates[1]); - location.addProperty("latitude", latitude); - location.addProperty("longitude", longitude); - } - location.addProperty("location_text", loc.text()); - status.add("location", location); - } - - return status; + if (!cmdline.hasOption(HTML_OPTION)) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(HTMLStatusExtractor.class.getName(), options); + System.exit(-1); } - private static final String HTML_OPTION = "html"; - - @SuppressWarnings("static-access") - public static void main(String[] args) throws Exception { - Options options = new Options(); - options.addOption(OptionBuilder.withArgName("path").hasArg() - .withDescription("HTML file from twitter.com").create(HTML_OPTION)); - - CommandLine cmdline = null; - CommandLineParser parser = new GnuParser(); - try { - cmdline = parser.parse(options, args); - } catch (ParseException exp) { - System.err.println("Error parsing command line: " + exp.getMessage()); - System.exit(-1); - } - - if (!cmdline.hasOption(HTML_OPTION)) { - HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp(HTMLStatusExtractor.class.getName(), options); - System.exit(-1); - } - - String html_filename = cmdline.getOptionValue(HTML_OPTION); - BufferedReader html_file = null; - StringBuffer buf = new StringBuffer(); - try { - html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); - String line; - while ((line = html_file.readLine()) != null) { - buf.append(line); - buf.append('\n'); - } - } catch (IOException e) { - e.printStackTrace(); - } finally { - html_file.close(); - } - - HTMLStatusExtractor hse = new HTMLStatusExtractor(); - JsonObject json = hse.extractTweet(buf.toString()); - Gson gson = new GsonBuilder().setPrettyPrinting().create(); - System.out.println(gson.toJson(json)); + String html_filename = cmdline.getOptionValue(HTML_OPTION); + BufferedReader html_file = null; + StringBuffer buf = new StringBuffer(); + try { + html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename))); + String line; + while ((line = html_file.readLine()) != null) { + buf.append(line); + buf.append('\n'); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + html_file.close(); } + + HTMLStatusExtractor hse = new HTMLStatusExtractor(); + ObjectNode json = hse.extractTweet(buf.toString()); + System.out.println(json); + } } diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java index da36389..e61ba5c 100644 --- a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java @@ -16,8 +16,6 @@ package cc.twittertools.download; -import cc.twittertools.corpus.data.HTMLStatusExtractor; - import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -30,8 +28,6 @@ import java.util.TimerTask; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicInteger; -import java.util.zip.GZIPOutputStream; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -39,14 +35,9 @@ import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; -import com.google.gson.Gson; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import com.google.gson.JsonSyntaxException; import com.ning.http.client.AsyncCompletionHandler; import com.ning.http.client.AsyncHttpClient; import com.ning.http.client.AsyncHttpClientConfig; @@ -67,14 +58,17 @@ public class AsyncHTMLStatusBlockCrawler { private static final int WAIT_BEFORE_RETRY = 1000; private static final Timer timer = new Timer(true); - private static final JsonParser JSON_PARSER = new JsonParser(); - private static final Gson GSON = new Gson(); - + enum Format { + JSON, + CBOR + } + private final File file; private final File output; private final File repair; private final AsyncHttpClient asyncHttpClient; private final boolean noFollow; + private final Format output_format; // key = statud id, value = tweet JSON private final ConcurrentSkipListMap crawl = new ConcurrentSkipListMap(); @@ -85,9 +79,10 @@ public class AsyncHTMLStatusBlockCrawler { private final AtomicInteger connections = new AtomicInteger(0); public AsyncHTMLStatusBlockCrawler(File file, String output, String repair, - boolean noFollow) throws IOException { + boolean noFollow, Format output_format) throws IOException { this.file = Preconditions.checkNotNull(file); this.noFollow = noFollow; + this.output_format = output_format; if (!file.exists()) { throw new IOException(file + " does not exist!"); @@ -125,7 +120,7 @@ public static String getUrl(long id, String username) { return String.format("http://twitter.com/%s/status/%d", username, id); } - public void fetch() throws IOException { + public void fetch() throws Exception { long start = System.currentTimeMillis(); LOG.info("Processing " + file); @@ -182,11 +177,16 @@ public void fetch() throws IOException { LOG.info("Writing tweets..."); int written = 0; - OutputStreamWriter out = new OutputStreamWriter(new GZIPOutputStream( - new FileOutputStream(output)), "UTF-8"); + CrawlerOutputWriter out; + if (output_format == Format.CBOR) { + out = new CBOROutput(output); + } else { + out = new GZipJSONOutput(output); + } + out.open(); for (Map.Entry entry : crawl.entrySet()) { written++; - out.write(entry.getValue() + "\n"); + out.write(entry.getValue()); } out.close(); @@ -195,12 +195,12 @@ public void fetch() throws IOException { if (this.repair != null) { LOG.info("Writing repair data file..."); written = 0; - out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8"); + OutputStreamWriter repair_out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8"); for (Map.Entry entry : crawl_repair.entrySet()) { written++; - out.write(entry.getValue() + "\n"); + repair_out.write(entry.getValue() + "\n"); } - out.close(); + repair_out.close(); LOG.info(written + " statuses need repair."); } @@ -218,7 +218,7 @@ private class TweetFetcherHandler extends AsyncCompletionHandler { private int httpStatus = -1; - private HTMLStatusExtractor extractor = new HTMLStatusExtractor(); + // private HTMLStatusExtractor extractor = new HTMLStatusExtractor(); public TweetFetcherHandler(long id, String username, String url, int numRetries, boolean followRedirects, String line) { @@ -298,12 +298,12 @@ public Response onCompleted(Response response) { try { String html = response.getResponseBody("UTF-8"); - JsonObject status = extractor.extractTweet(html); + // JsonObject status = extractor.extractTweet(html); // save the requested id - status.addProperty("requested_id", new Long(id)); + // status.addProperty("requested_id", new Long(id)); - crawl.put(id, GSON.toJson(status)); + crawl.put(id, html); connections.decrementAndGet(); return response; @@ -311,10 +311,6 @@ public Response onCompleted(Response response) { LOG.warn("Error (" + e + "): " + url); retry(); return response; - } catch (JsonSyntaxException e) { - LOG.warn("Unable to parse embedded JSON: " + url); - retry(); - return response; } catch (NullPointerException e) { LOG.warn("Unexpected format for embedded JSON: " + url); retry(); @@ -375,6 +371,7 @@ private void crawlURL(String url, TweetFetcherHandler handler) { private static final String OUTPUT_OPTION = "output"; private static final String REPAIR_OPTION = "repair"; private static final String NOFOLLOW_OPTION = "noFollow"; + private static final String FORMAT_OPTION = "cbor"; @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { @@ -387,6 +384,7 @@ public static void main(String[] args) throws Exception { .withDescription("output repair file (can be used later as a data file)") .create(REPAIR_OPTION)); options.addOption(NOFOLLOW_OPTION, NOFOLLOW_OPTION, false, "don't follow 301 redirects"); + options.addOption(FORMAT_OPTION, FORMAT_OPTION, false, "use CBOR format"); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); @@ -407,6 +405,11 @@ public static void main(String[] args) throws Exception { String output = cmdline.getOptionValue(OUTPUT_OPTION); String repair = cmdline.getOptionValue(REPAIR_OPTION); boolean noFollow = cmdline.hasOption(NOFOLLOW_OPTION); - new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow).fetch(); + boolean cbor = cmdline.hasOption(FORMAT_OPTION); + Format output_format = Format.JSON; + if (cbor) { + output_format = Format.CBOR; + } + new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow, output_format).fetch(); } } diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java new file mode 100644 index 0000000..8143ccd --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java @@ -0,0 +1,86 @@ +package cc.twittertools.download; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.codec.digest.DigestUtils; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.dataformat.cbor.CBORFactory; +import com.fasterxml.jackson.dataformat.cbor.CBORGenerator; + +public class CBOROutput extends GZipJSONOutput { + + CBORFactory cbor_fac; + ObjectMapper cbor_mapper; + CBORGenerator cbor_gen; + JsonFactory json_fac; + ObjectMapper json_mapper; + OutputStream os; + + public CBOROutput(File output_file) { + super(output_file); + cbor_fac = new CBORFactory(); + cbor_fac.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET); + cbor_mapper = new ObjectMapper(cbor_fac); + json_fac = new JsonFactory(); + json_mapper = new ObjectMapper(json_fac); + } + + @Override + public void open() throws Exception { + os = new GZIPOutputStream(new FileOutputStream(output_file)); + cbor_gen = cbor_fac.createGenerator(os); + } + + @Override @SuppressWarnings("unchecked") + public void write(String html) throws Exception { + ObjectNode json_tweet = ext.extractTweet(html); + ObjectNode cca_obj = wrap_cca(html, json_tweet); + cbor_mapper.writeTree(cbor_gen, cca_obj); + } + + @Override + public void close() throws Exception { + os.close(); + } + + @SuppressWarnings("unchecked") + public ObjectNode wrap_cca(String tweet, ObjectNode json) throws JsonProcessingException, IOException { + final JsonNodeFactory jf = JsonNodeFactory.instance; + + ObjectNode cca_obj = jf.objectNode(); + long id = json.get("id").asLong(); + String username = json.get("user").get("screen_name").textValue(); + String url = AsyncHTMLStatusBlockCrawler.getUrl(id, username); + cca_obj.put("url", url); + cca_obj.put("timestamp", System.currentTimeMillis()); + cca_obj.putNull("request"); + + ObjectNode response = cca_obj.putObject("response"); + response.put("status", "200"); + response.put("server", "twitter.com"); + response.putArray("headers").add(jf.arrayNode().add("Content-Type").add("text/html")); + response.put("body", tweet); + + cca_obj.put("key", "com_twitter_" + DigestUtils.sha1Hex(url)); + + ArrayNode indices = cca_obj.arrayNode(); + indices.addObject().put("crawl", "tweets"); + cca_obj.set("indices", indices); + + ArrayNode features = cca_obj.arrayNode(); + features.addObject().replace("json", json); + cca_obj.set("features", features); + return cca_obj; + } +} diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java new file mode 100644 index 0000000..ebc119c --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java @@ -0,0 +1,8 @@ +package cc.twittertools.download; + +public interface CrawlerOutputWriter { + + void open() throws Exception; + void write(String s) throws Exception; + void close() throws Exception; +} diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java new file mode 100644 index 0000000..b2e733e --- /dev/null +++ b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java @@ -0,0 +1,40 @@ +package cc.twittertools.download; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; +import java.util.zip.GZIPOutputStream; + +import com.fasterxml.jackson.databind.node.ObjectNode; + +import cc.twittertools.corpus.data.HTMLStatusExtractor; + +public class GZipJSONOutput implements CrawlerOutputWriter { + + protected File output_file; + protected OutputStreamWriter out; + protected HTMLStatusExtractor ext; + + public GZipJSONOutput(File output_file) { + this.output_file = output_file; + ext = new HTMLStatusExtractor(); + } + + @Override + public void open() throws Exception { + out = new OutputStreamWriter(new GZIPOutputStream( + new FileOutputStream(output_file)), "UTF-8"); + } + + @Override + public void write(String s) throws Exception { + ObjectNode json = ext.extractTweet(s); + out.write(json.toString() + "\n"); + } + + @Override + public void close() throws Exception { + out.close(); + } + +}