Here's a list of the most commonly used HTML scrapers for different programming languages: http://stackoverflow.com/questions/2861/options-for-html-scraping
I've chosen jsoup for Java since it's the simplest to implement and has minimal dependencies compared to HtmlUnit and the rest.
Here's how I've written my implementation:
package org.ipiel.ipielHtmlParser;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Extracts bird records from {@code input.html} with jsoup and writes them to
 * {@code out.txt}, one comma-separated line per bird.
 *
 * <p>The {@code p[class=MsoNormal]} paragraphs of the document are consumed in
 * groups of three: a paragraph holding the common name, scientific name and
 * optional "endemic" marker, a paragraph holding the location, and a spacer
 * paragraph that is skipped.
 *
 * <p>Each output line is
 * {@code commonName,scientificName,endemic,location}. Fields are joined as-is;
 * no CSV quoting or escaping is applied.
 *
 * @author Edward P. Legaspi
 * @since Jul 29, 2012
 **/
public class JsoupParserImpl {

    /** Charset used both to parse the input and to encode the output. */
    private static final String CHARSET = "UTF-8";

    private static final String INPUT_FILE = "input.html";
    private static final String OUTPUT_FILE = "out.txt";

    /**
     * Runs the conversion; an I/O failure is reported on standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            new JsoupParserImpl();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code input.html} and writes the extracted records to
     * {@code out.txt}, echoing every line to standard output.
     *
     * @throws IOException if the input cannot be read or the output cannot
     *         be written
     */
    public JsoupParserImpl() throws IOException {
        Document doc = Jsoup.parse(new File(INPUT_FILE), CHARSET);
        Elements paragraphs = doc.select("p[class=MsoNormal]");
        Iterator<Element> ite = paragraphs.iterator();

        // try-with-resources closes the writer even if extraction fails midway.
        // The output is encoded explicitly so that it matches the charset the
        // input was parsed with instead of the platform default.
        try (PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(OUTPUT_FILE), CHARSET))) {
            while (ite.hasNext()) {
                Element bird = ite.next(); // common name + scientific name

                // A trailing group may lack its location and/or spacer
                // paragraph; guard each step instead of letting next() throw.
                Element location = ite.hasNext() ? ite.next() : null;
                if (ite.hasNext()) {
                    ite.next(); // spacer
                }

                String out = toCsvLine(bird, location);
                System.out.println(out);
                pw.write(out);
                pw.write("\n");
            }

            // PrintWriter never throws on write; surface failures explicitly.
            if (pw.checkError()) {
                throw new IOException("Failed to write " + OUTPUT_FILE);
            }
        }
    }

    /** Builds one output line from a bird paragraph and its location paragraph. */
    private static String toCsvLine(Element bird, Element location) {
        String commonName = textOf(bird.select("span[class=comname]").first()).trim();
        String scientificName = textOf(bird.select("span[class=sciname]").first());
        String endemic = textOf(bird.select("span[class=endemic]").first());
        return commonName + "," + scientificName + "," + endemic + "," + textOf(location);
    }

    /** Returns the element's text, or an empty string when the element is absent. */
    private static String textOf(Element element) {
        return (element != null) ? element.text() : "";
    }
}
0 komentar:
Please leave a comment if you have any questions.