diff --git a/twitter-tools-core/pom.xml b/twitter-tools-core/pom.xml
index 83b7436..05f21b2 100644
--- a/twitter-tools-core/pom.xml
+++ b/twitter-tools-core/pom.xml
@@ -48,7 +48,8 @@
UTF-8
UTF-8
-
+ 2.5.0
+
@@ -198,6 +199,11 @@
commons-cli
1.2
+
+ commons-codec
+ commons-codec
+ 1.10
+
commons-io
commons-io
@@ -265,6 +271,28 @@
jsoup
1.7.3
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ ${jackson.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ ${jackson.version}
+
+
+ com.fasterxml.jackson.dataformat
+ jackson-dataformat-cbor
+ 2.5.3
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${jackson.version}
+
diff --git a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java
index 5c7bf81..2ed3bd2 100644
--- a/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java
+++ b/twitter-tools-core/src/main/java/cc/twittertools/corpus/data/HTMLStatusExtractor.java
@@ -1,27 +1,17 @@
package cc.twittertools.corpus.data;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.Map;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.Date;
+import java.util.LinkedHashMap;
+import java.util.Map;
import java.util.TimeZone;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Document;
-import org.jsoup.select.Elements;
-
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-import com.google.gson.JsonObject;
-
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -29,142 +19,147 @@
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
-public class HTMLStatusExtractor {
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
- public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy");
+public class HTMLStatusExtractor {
- public HTMLStatusExtractor() {
- date_fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
+ public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy");
+ public JsonNodeFactory jfac;
+
+ public HTMLStatusExtractor() {
+ date_fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
+ this.jfac = JsonNodeFactory.instance;
+ }
+
+ public static Map splitQuery(URL url)
+ throws java.io.UnsupportedEncodingException {
+ Map query_pairs = new LinkedHashMap();
+ String query = url.getQuery();
+ String[] pairs = query.split("&");
+ for (String pair : pairs) {
+ int idx = pair.indexOf("=");
+ query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"),
+ URLDecoder.decode(pair.substring(idx + 1), "UTF-8"));
}
+ return query_pairs;
+ }
+
+ public ObjectNode extractTweet(String html)
+ throws java.net.MalformedURLException, java.io.UnsupportedEncodingException {
+ ObjectNode status = jfac.objectNode();
+
+ Document doc = Jsoup.parse(html);
+ Element tweet_div = doc.select("div.permalink-tweet").first();
+
+ String tweet_text = tweet_div.select("p.tweet-text").first().text();
+ status.put("text", tweet_text);
+
+ String tweet_id = tweet_div.attr("data-tweet-id");
+ status.put("id_str", tweet_id);
+ status.put("id", Long.parseLong(tweet_id));
+
+ String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
+ Date created_at = new Date();
+ created_at.setTime(Long.parseLong(timestamp) * 1000);
+ status.put("created_at", date_fmt.format(created_at));
+
+ Elements js_stats_retweets = doc.select("li.js-stat-retweets");
+ if (!js_stats_retweets.isEmpty()) {
+ status.put("retweeted", true);
+ String count = js_stats_retweets.select("strong").first().text();
+ status.put("retweet_count", Long.parseLong(count));
+ } else {
+ status.put("retweeted", false);
+ status.put("retweet_count", 0);
+ }
+ Elements js_stats_favs = doc.select("li.js-stat-favorites");
+ status.put("favorited", !js_stats_favs.isEmpty());
+
+
+ // User subfield
+ ObjectNode user = status.putObject("user");
+ String user_id = tweet_div.attr("data-user-id");
+ user.put("id_str", user_id);
+ user.put("id", Long.parseLong(user_id));
+ String screen_name = tweet_div.attr("data-screen-name");
+ user.put("screen_name", screen_name);
+ String user_name = tweet_div.attr("data-name");
+ user.put("name", user_name);
+
+ // Geo information
+ Elements tweet_loc = doc.select("a.tweet-geo-text");
+ if (!tweet_loc.isEmpty()) {
+ ObjectNode location = status.putObject("location");
+ Element loc = tweet_loc.first();
+ // Adding http to avoid malformed URL exception
+ URL url = new URL("http:" + loc.attr("href"));
+ Map query_params = HTMLStatusExtractor.splitQuery(url);
+ // Loop over possible query parameters
+ // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html
+ String lat_and_long = null;
+ if ((lat_and_long = query_params.get("ll")) != null
+ || (lat_and_long = query_params.get("sll")) != null
+ || (lat_and_long = query_params.get("cbll")) != null
+ || (lat_and_long = query_params.get("q")) != null) {
+ String[] coordinates = lat_and_long.split(",");
+ double latitude = Double.parseDouble(coordinates[0]);
+ double longitude = Double.parseDouble(coordinates[1]);
+ location.put("latitude", latitude);
+ location.put("longitude", longitude);
+ }
+ location.put("location_text", loc.text());
+ }
+
+ return status;
+ }
- public static Map splitQuery(URL url)
- throws java.io.UnsupportedEncodingException {
- Map query_pairs = new LinkedHashMap();
- String query = url.getQuery();
- String[] pairs = query.split("&");
- for (String pair : pairs) {
- int idx = pair.indexOf("=");
- query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"),
- URLDecoder.decode(pair.substring(idx + 1), "UTF-8"));
- }
- return query_pairs;
+ private static final String HTML_OPTION = "html";
+
+ @SuppressWarnings("static-access")
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ options.addOption(OptionBuilder.withArgName("path").hasArg()
+ .withDescription("HTML file from twitter.com").create(HTML_OPTION));
+
+ CommandLine cmdline = null;
+ CommandLineParser parser = new GnuParser();
+ try {
+ cmdline = parser.parse(options, args);
+ } catch (ParseException exp) {
+ System.err.println("Error parsing command line: " + exp.getMessage());
+ System.exit(-1);
}
- public JsonObject extractTweet(String html)
- throws java.net.MalformedURLException, java.io.UnsupportedEncodingException {
- JsonObject status = new JsonObject();
-
- Document doc = Jsoup.parse(html);
- Element tweet_div = doc.select("div.permalink-tweet").first();
-
- String tweet_text = tweet_div.select("p.tweet-text").first().text();
- status.addProperty("text", tweet_text);
-
- String tweet_id = tweet_div.attr("data-tweet-id");
- status.addProperty("id_str", tweet_id);
- status.addProperty("id", Long.parseLong(tweet_id));
-
- String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
- Date created_at = new Date();
- created_at.setTime(Long.parseLong(timestamp) * 1000);
- status.addProperty("created_at", date_fmt.format(created_at));
-
- Elements js_stats_retweets = doc.select("li.js-stat-retweets");
- if (!js_stats_retweets.isEmpty()) {
- status.addProperty("retweeted", true);
- String count = js_stats_retweets.select("strong").first().text();
- status.addProperty("retweet_count", Long.parseLong(count));
- } else {
- status.addProperty("retweeted", false);
- status.addProperty("retweet_count", 0);
- }
- Elements js_stats_favs = doc.select("li.js-stat-favorites");
- status.addProperty("favorited", !js_stats_favs.isEmpty());
-
-
- // User subfield
- JsonObject user = new JsonObject();
- String user_id = tweet_div.attr("data-user-id");
- user.addProperty("id_str", user_id);
- user.addProperty("id", Long.parseLong(user_id));
- String screen_name = tweet_div.attr("data-screen-name");
- user.addProperty("screen_name", screen_name);
- String user_name = tweet_div.attr("data-name");
- user.addProperty("name", user_name);
-
- status.add("user", user);
-
- // Geo information
- Elements tweet_loc = doc.select("a.tweet-geo-text");
- if (!tweet_loc.isEmpty()) {
- JsonObject location = new JsonObject();
- Element loc = tweet_loc.first();
- // Adding http to avoid malformed URL exception
- URL url = new URL("http:" + loc.attr("href"));
- Map query_params = HTMLStatusExtractor.splitQuery(url);
- // Loop over possible query parameters
- // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html
- String lat_and_long = null;
- if ((lat_and_long = query_params.get("ll")) != null
- || (lat_and_long = query_params.get("sll")) != null
- || (lat_and_long = query_params.get("cbll")) != null
- || (lat_and_long = query_params.get("q")) != null) {
- String[] coordinates = lat_and_long.split(",");
- double latitude = Double.parseDouble(coordinates[0]);
- double longitude = Double.parseDouble(coordinates[1]);
- location.addProperty("latitude", latitude);
- location.addProperty("longitude", longitude);
- }
- location.addProperty("location_text", loc.text());
- status.add("location", location);
- }
-
- return status;
+ if (!cmdline.hasOption(HTML_OPTION)) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp(HTMLStatusExtractor.class.getName(), options);
+ System.exit(-1);
}
- private static final String HTML_OPTION = "html";
-
- @SuppressWarnings("static-access")
- public static void main(String[] args) throws Exception {
- Options options = new Options();
- options.addOption(OptionBuilder.withArgName("path").hasArg()
- .withDescription("HTML file from twitter.com").create(HTML_OPTION));
-
- CommandLine cmdline = null;
- CommandLineParser parser = new GnuParser();
- try {
- cmdline = parser.parse(options, args);
- } catch (ParseException exp) {
- System.err.println("Error parsing command line: " + exp.getMessage());
- System.exit(-1);
- }
-
- if (!cmdline.hasOption(HTML_OPTION)) {
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp(HTMLStatusExtractor.class.getName(), options);
- System.exit(-1);
- }
-
- String html_filename = cmdline.getOptionValue(HTML_OPTION);
- BufferedReader html_file = null;
- StringBuffer buf = new StringBuffer();
- try {
- html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename)));
- String line;
- while ((line = html_file.readLine()) != null) {
- buf.append(line);
- buf.append('\n');
- }
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- html_file.close();
- }
-
- HTMLStatusExtractor hse = new HTMLStatusExtractor();
- JsonObject json = hse.extractTweet(buf.toString());
- Gson gson = new GsonBuilder().setPrettyPrinting().create();
- System.out.println(gson.toJson(json));
+ String html_filename = cmdline.getOptionValue(HTML_OPTION);
+ BufferedReader html_file = null;
+ StringBuffer buf = new StringBuffer();
+ try {
+ html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename)));
+ String line;
+ while ((line = html_file.readLine()) != null) {
+ buf.append(line);
+ buf.append('\n');
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ html_file.close();
}
+
+ HTMLStatusExtractor hse = new HTMLStatusExtractor();
+ ObjectNode json = hse.extractTweet(buf.toString());
+ System.out.println(json);
+ }
}
diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java
index da36389..e61ba5c 100644
--- a/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java
+++ b/twitter-tools-core/src/main/java/cc/twittertools/download/AsyncHTMLStatusBlockCrawler.java
@@ -16,8 +16,6 @@
package cc.twittertools.download;
-import cc.twittertools.corpus.data.HTMLStatusExtractor;
-
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -30,8 +28,6 @@
import java.util.TimerTask;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
-import java.util.zip.GZIPOutputStream;
-
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
@@ -39,14 +35,9 @@
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
-import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import com.google.common.base.Preconditions;
-import com.google.gson.Gson;
-import com.google.gson.JsonObject;
-import com.google.gson.JsonParser;
-import com.google.gson.JsonSyntaxException;
import com.ning.http.client.AsyncCompletionHandler;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.AsyncHttpClientConfig;
@@ -67,14 +58,17 @@ public class AsyncHTMLStatusBlockCrawler {
private static final int WAIT_BEFORE_RETRY = 1000;
private static final Timer timer = new Timer(true);
- private static final JsonParser JSON_PARSER = new JsonParser();
- private static final Gson GSON = new Gson();
-
+ enum Format {
+ JSON,
+ CBOR
+ }
+
private final File file;
private final File output;
private final File repair;
private final AsyncHttpClient asyncHttpClient;
private final boolean noFollow;
+ private final Format output_format;
// key = statud id, value = tweet JSON
private final ConcurrentSkipListMap crawl = new ConcurrentSkipListMap();
@@ -85,9 +79,10 @@ public class AsyncHTMLStatusBlockCrawler {
private final AtomicInteger connections = new AtomicInteger(0);
public AsyncHTMLStatusBlockCrawler(File file, String output, String repair,
- boolean noFollow) throws IOException {
+ boolean noFollow, Format output_format) throws IOException {
this.file = Preconditions.checkNotNull(file);
this.noFollow = noFollow;
+ this.output_format = output_format;
if (!file.exists()) {
throw new IOException(file + " does not exist!");
@@ -125,7 +120,7 @@ public static String getUrl(long id, String username) {
return String.format("http://twitter.com/%s/status/%d", username, id);
}
- public void fetch() throws IOException {
+ public void fetch() throws Exception {
long start = System.currentTimeMillis();
LOG.info("Processing " + file);
@@ -182,11 +177,16 @@ public void fetch() throws IOException {
LOG.info("Writing tweets...");
int written = 0;
- OutputStreamWriter out = new OutputStreamWriter(new GZIPOutputStream(
- new FileOutputStream(output)), "UTF-8");
+ CrawlerOutputWriter out;
+ if (output_format == Format.CBOR) {
+ out = new CBOROutput(output);
+ } else {
+ out = new GZipJSONOutput(output);
+ }
+ out.open();
for (Map.Entry entry : crawl.entrySet()) {
written++;
- out.write(entry.getValue() + "\n");
+ out.write(entry.getValue());
}
out.close();
@@ -195,12 +195,12 @@ public void fetch() throws IOException {
if (this.repair != null) {
LOG.info("Writing repair data file...");
written = 0;
- out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8");
+ OutputStreamWriter repair_out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8");
for (Map.Entry entry : crawl_repair.entrySet()) {
written++;
- out.write(entry.getValue() + "\n");
+ repair_out.write(entry.getValue() + "\n");
}
- out.close();
+ repair_out.close();
LOG.info(written + " statuses need repair.");
}
@@ -218,7 +218,7 @@ private class TweetFetcherHandler extends AsyncCompletionHandler {
private int httpStatus = -1;
- private HTMLStatusExtractor extractor = new HTMLStatusExtractor();
+ // private HTMLStatusExtractor extractor = new HTMLStatusExtractor();
public TweetFetcherHandler(long id, String username, String url, int numRetries,
boolean followRedirects, String line) {
@@ -298,12 +298,12 @@ public Response onCompleted(Response response) {
try {
String html = response.getResponseBody("UTF-8");
- JsonObject status = extractor.extractTweet(html);
+ // JsonObject status = extractor.extractTweet(html);
// save the requested id
- status.addProperty("requested_id", new Long(id));
+ // status.addProperty("requested_id", new Long(id));
- crawl.put(id, GSON.toJson(status));
+ crawl.put(id, html);
connections.decrementAndGet();
return response;
@@ -311,10 +311,6 @@ public Response onCompleted(Response response) {
LOG.warn("Error (" + e + "): " + url);
retry();
return response;
- } catch (JsonSyntaxException e) {
- LOG.warn("Unable to parse embedded JSON: " + url);
- retry();
- return response;
} catch (NullPointerException e) {
LOG.warn("Unexpected format for embedded JSON: " + url);
retry();
@@ -375,6 +371,7 @@ private void crawlURL(String url, TweetFetcherHandler handler) {
private static final String OUTPUT_OPTION = "output";
private static final String REPAIR_OPTION = "repair";
private static final String NOFOLLOW_OPTION = "noFollow";
+ private static final String FORMAT_OPTION = "cbor";
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
@@ -387,6 +384,7 @@ public static void main(String[] args) throws Exception {
.withDescription("output repair file (can be used later as a data file)")
.create(REPAIR_OPTION));
options.addOption(NOFOLLOW_OPTION, NOFOLLOW_OPTION, false, "don't follow 301 redirects");
+ options.addOption(FORMAT_OPTION, FORMAT_OPTION, false, "use CBOR format");
CommandLine cmdline = null;
CommandLineParser parser = new GnuParser();
@@ -407,6 +405,11 @@ public static void main(String[] args) throws Exception {
String output = cmdline.getOptionValue(OUTPUT_OPTION);
String repair = cmdline.getOptionValue(REPAIR_OPTION);
boolean noFollow = cmdline.hasOption(NOFOLLOW_OPTION);
- new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow).fetch();
+ boolean cbor = cmdline.hasOption(FORMAT_OPTION);
+ Format output_format = Format.JSON;
+ if (cbor) {
+ output_format = Format.CBOR;
+ }
+ new AsyncHTMLStatusBlockCrawler(new File(data), output, repair, noFollow, output_format).fetch();
}
}
diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java
new file mode 100644
index 0000000..8143ccd
--- /dev/null
+++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CBOROutput.java
@@ -0,0 +1,86 @@
+package cc.twittertools.download;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.commons.codec.digest.DigestUtils;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+
+public class CBOROutput extends GZipJSONOutput {
+
+ CBORFactory cbor_fac;
+ ObjectMapper cbor_mapper;
+ CBORGenerator cbor_gen;
+ JsonFactory json_fac;
+ ObjectMapper json_mapper;
+ OutputStream os;
+
+ public CBOROutput(File output_file) {
+ super(output_file);
+ cbor_fac = new CBORFactory();
+ cbor_fac.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
+ cbor_mapper = new ObjectMapper(cbor_fac);
+ json_fac = new JsonFactory();
+ json_mapper = new ObjectMapper(json_fac);
+ }
+
+ @Override
+ public void open() throws Exception {
+ os = new GZIPOutputStream(new FileOutputStream(output_file));
+ cbor_gen = cbor_fac.createGenerator(os);
+ }
+
+ @Override @SuppressWarnings("unchecked")
+ public void write(String html) throws Exception {
+ ObjectNode json_tweet = ext.extractTweet(html);
+ ObjectNode cca_obj = wrap_cca(html, json_tweet);
+ cbor_mapper.writeTree(cbor_gen, cca_obj);
+ }
+
+ @Override
+ public void close() throws Exception {
+ os.close();
+ }
+
+ @SuppressWarnings("unchecked")
+ public ObjectNode wrap_cca(String tweet, ObjectNode json) throws JsonProcessingException, IOException {
+ final JsonNodeFactory jf = JsonNodeFactory.instance;
+
+ ObjectNode cca_obj = jf.objectNode();
+ long id = json.get("id").asLong();
+ String username = json.get("user").get("screen_name").textValue();
+ String url = AsyncHTMLStatusBlockCrawler.getUrl(id, username);
+ cca_obj.put("url", url);
+ cca_obj.put("timestamp", System.currentTimeMillis());
+ cca_obj.putNull("request");
+
+ ObjectNode response = cca_obj.putObject("response");
+ response.put("status", "200");
+ response.put("server", "twitter.com");
+ response.putArray("headers").add(jf.arrayNode().add("Content-Type").add("text/html"));
+ response.put("body", tweet);
+
+ cca_obj.put("key", "com_twitter_" + DigestUtils.sha1Hex(url));
+
+ ArrayNode indices = cca_obj.arrayNode();
+ indices.addObject().put("crawl", "tweets");
+ cca_obj.set("indices", indices);
+
+ ArrayNode features = cca_obj.arrayNode();
+ features.addObject().replace("json", json);
+ cca_obj.set("features", features);
+ return cca_obj;
+ }
+}
diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java
new file mode 100644
index 0000000..ebc119c
--- /dev/null
+++ b/twitter-tools-core/src/main/java/cc/twittertools/download/CrawlerOutputWriter.java
@@ -0,0 +1,8 @@
+package cc.twittertools.download;
+
+public interface CrawlerOutputWriter {
+
+ void open() throws Exception;
+ void write(String s) throws Exception;
+ void close() throws Exception;
+}
diff --git a/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java
new file mode 100644
index 0000000..b2e733e
--- /dev/null
+++ b/twitter-tools-core/src/main/java/cc/twittertools/download/GZipJSONOutput.java
@@ -0,0 +1,40 @@
+package cc.twittertools.download;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.util.zip.GZIPOutputStream;
+
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import cc.twittertools.corpus.data.HTMLStatusExtractor;
+
+public class GZipJSONOutput implements CrawlerOutputWriter {
+
+ protected File output_file;
+ protected OutputStreamWriter out;
+ protected HTMLStatusExtractor ext;
+
+ public GZipJSONOutput(File output_file) {
+ this.output_file = output_file;
+ ext = new HTMLStatusExtractor();
+ }
+
+ @Override
+ public void open() throws Exception {
+ out = new OutputStreamWriter(new GZIPOutputStream(
+ new FileOutputStream(output_file)), "UTF-8");
+ }
+
+ @Override
+ public void write(String s) throws Exception {
+ ObjectNode json = ext.extractTweet(s);
+ out.write(json.toString() + "\n");
+ }
+
+ @Override
+ public void close() throws Exception {
+ out.close();
+ }
+
+}