/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.text.wikipedia;

import com.google.common.io.Closeables;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.text.DecimalFormat;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that splits a Wikipedia XML dump into several smaller XML files ("chunks").
 *
 * <p>The dump is read line by line from the local file system (optionally through a bzip2
 * decompressor when the path ends with {@code .bz2}). Every {@code <page>...</page>} element is
 * copied into the current chunk; once the chunk grows past the requested size it is closed with
 * {@code </mediawiki>} and written to {@code <outputDir>/chunk-NNNN.xml} through the Hadoop
 * {@link FileSystem} selected by the output directory URI. Each chunk starts with the same fixed
 * {@code <mediawiki>}/{@code <siteinfo>} header so that it is a standalone document.
 */
public final class WikipediaXmlSplitter {
    private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);

    /** Number of bytes in a megabyte; kept as a long so the chunk size cannot overflow an int. */
    private static final long BYTES_PER_MEGABYTE = 1024L * 1024L;

    /** Element that terminates every chunk document. */
    private static final String FOOTER = "</mediawiki>";

    /** Fixed document prologue written at the start of every chunk. */
    private static final String HEADER =
        "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd\" version=\"0.3\" xml:lang=\"en\">\n"
        + "  <siteinfo>\n"
        + "<sitename>Wikipedia</sitename>\n"
        + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
        + "    <generator>MediaWiki 1.13alpha</generator>\n"
        + "    <case>first-letter</case>\n"
        + "    <namespaces>\n"
        + "      <namespace key=\"-2\">Media</namespace>\n"
        + "      <namespace key=\"-1\">Special</namespace>\n"
        + "      <namespace key=\"0\" />\n"
        + "      <namespace key=\"1\">Talk</namespace>\n"
        + "      <namespace key=\"2\">User</namespace>\n"
        + "      <namespace key=\"3\">User talk</namespace>\n"
        + "      <namespace key=\"4\">Wikipedia</namespace>\n"
        + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
        + "      <namespace key=\"6\">Image</namespace>\n"
        + "      <namespace key=\"7\">Image talk</namespace>\n"
        + "      <namespace key=\"8\">MediaWiki</namespace>\n"
        + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
        + "      <namespace key=\"10\">Template</namespace>\n"
        + "      <namespace key=\"11\">Template talk</namespace>\n"
        + "      <namespace key=\"12\">Help</namespace>\n"
        + "      <namespace key=\"13\">Help talk</namespace>\n"
        + "      <namespace key=\"14\">Category</namespace>\n"
        + "      <namespace key=\"15\">Category talk</namespace>\n"
        + "      <namespace key=\"100\">Portal</namespace>\n"
        + "      <namespace key=\"101\">Portal talk</namespace>\n"
        + "    </namespaces>\n"
        + "  </siteinfo>\n";

    private WikipediaXmlSplitter() {
    }

    /**
     * Parses the command line and splits the given dump file into chunks.
     *
     * <p>Recognised options: {@code --dumpFile/-d} (required), {@code --outputDir/-o} (required),
     * {@code --chunkSize/-c} in megabytes (required), {@code --numChunks/-n} (optional upper bound
     * on the number of chunks written), {@code --s3ID/-i} and {@code --s3Secret/-s} (optional S3
     * credentials copied into the Hadoop configuration).
     *
     * <p>If option parsing fails, the error is logged, the help text is printed and the method
     * returns. If the dump file does not exist, an error is logged and the method returns.
     *
     * @param args command-line arguments
     * @throws IOException if reading the dump or writing a chunk fails
     * @throws NumberFormatException if {@code chunkSize} or {@code numChunks} is not an integer
     */
    public static void main(String[] args) throws IOException {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)")
            .withShortName("d").create();
        Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
            .withShortName("o").create();
        Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key")
            .withShortName("i").create();
        Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key")
            .withShortName("s").create();
        Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes")
            .withShortName("c").create();
        Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription("The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
        Group group = gbuilder.withName("Options")
            .withOption(dumpFileOpt)
            .withOption(outputDirOpt)
            .withOption(chunkSizeOpt)
            .withOption(numChunksOpt)
            .withOption(s3IdOpt)
            .withOption(s3SecretOpt)
            .create();

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine;
        try {
            cmdLine = parser.parse(args);
        } catch (OptionException e) {
            log.error("Error while parsing options", e);
            CommandLineUtil.printHelp(group);
            return;
        }

        Configuration conf = new Configuration();
        String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
        String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

        // The same credentials are registered for both the s3 and s3n schemes.
        if (cmdLine.hasOption(s3IdOpt)) {
            String id = (String) cmdLine.getValue(s3IdOpt);
            conf.set("fs.s3n.awsAccessKeyId", id);
            conf.set("fs.s3.awsAccessKeyId", id);
        }
        if (cmdLine.hasOption(s3SecretOpt)) {
            String secret = (String) cmdLine.getValue(s3SecretOpt);
            conf.set("fs.s3n.awsSecretAccessKey", secret);
            conf.set("fs.s3.awsSecretAccessKey", secret);
        }
        // NOTE(review): presumably selected so that no .crc checksum side files are produced
        // when writing to the local file system - confirm against Hadoop documentation.
        conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
        FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

        // Computed as a long: an int product wraps around for sizes of 2048 MB and above.
        long chunkSize = BYTES_PER_MEGABYTE * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        int numChunks = Integer.MAX_VALUE;
        if (cmdLine.hasOption(numChunksOpt)) {
            numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
        }

        File dumpFile = new File(dumpFilePath);
        if (!dumpFile.exists()) {
            log.error("Input file path {} doesn't exist", dumpFilePath);
            return;
        }

        FileLineIterator it = openDump(dumpFile, dumpFilePath);
        try {
            splitIntoChunks(it, fs, outputDirPath, chunkSize, numChunks);
        } finally {
            // Release the input even when we stop early (numChunks reached) or a write fails.
            // A failure while closing a stream we only read from is not worth propagating.
            Closeables.close(it, true);
        }
    }

    /**
     * Opens the dump for line-wise reading, decompressing on the fly when the path ends with
     * {@code .bz2}; any other file is read as-is.
     */
    private static FileLineIterator openDump(File dumpFile, String dumpFilePath) throws IOException {
        if (dumpFilePath.endsWith(".bz2")) {
            BZip2Codec codec = new BZip2Codec();
            return new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
        }
        return new FileLineIterator(dumpFile);
    }

    /**
     * Copies every {@code <page>} element from {@code it} into consecutive chunk files, starting
     * a new chunk whenever the current one exceeds {@code chunkSize} characters, and stopping
     * after {@code numChunks} files have been written.
     */
    private static void splitIntoChunks(FileLineIterator it,
                                        FileSystem fs,
                                        String outputDirPath,
                                        long chunkSize,
                                        int numChunks) throws IOException {
        DecimalFormat decimalFormatter = new DecimalFormat("0000");
        StringBuilder content = new StringBuilder(HEADER);
        boolean pendingPages = false;
        int fileNumber = 0;

        while (it.hasNext()) {
            String thisLine = it.next();
            if (!thisLine.trim().startsWith("<page>")) {
                // Anything outside a <page> element (original header, footer, ...) is skipped.
                continue;
            }

            // Copy lines up to, but not including, the closing </page>.
            boolean truncated = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (!it.hasNext()) {
                    truncated = true;
                    break;
                }
                thisLine = it.next();
            }
            if (!truncated) {
                // thisLine is the closing </page>. When the input ended mid-page the last line
                // has already been appended above, so it must not be appended a second time.
                content.append(thisLine).append('\n');
            }
            pendingPages = true;

            if (content.length() <= chunkSize && !truncated) {
                continue;
            }

            writeChunk(fs, outputDirPath, decimalFormatter.format(++fileNumber), content);
            pendingPages = false;
            if (fileNumber >= numChunks) {
                return;
            }
            content = new StringBuilder(HEADER);
        }

        // Pages collected after the last size-triggered flush would otherwise be lost when the
        // dump ends normally (i.e. the final page is properly closed).
        if (pendingPages) {
            writeChunk(fs, outputDirPath, decimalFormatter.format(++fileNumber), content);
        }
    }

    /**
     * Terminates {@code content} with the closing {@code </mediawiki>} element and writes it as
     * UTF-8 to {@code <outputDirPath>/chunk-<chunkId>.xml}.
     */
    private static void writeChunk(FileSystem fs,
                                   String outputDirPath,
                                   String chunkId,
                                   StringBuilder content) throws IOException {
        content.append(FOOTER);
        String filename = outputDirPath + "/chunk-" + chunkId + ".xml";
        BufferedWriter chunkWriter =
            new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
        try {
            chunkWriter.write(content.toString(), 0, content.length());
        } finally {
            // Do not swallow close failures: for a writer they can mean data was not flushed.
            Closeables.close(chunkWriter, false);
        }
    }
}

