/** * This version is to scrape data from a specific web page. * Copyright (C) 2007 Andy Turner, CCG, University of Leeds, UK. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ package uk.ac.leeds.ccg.andyt.web; /* * Scaper.java * * Created on 26 February 2007, 15:58 * * To change this template, choose Tools | Template Manager * and open the template in the editor. 
/* (end of the file-header comment) */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StreamTokenizer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Fetches result pages from houseprices.co.uk for a set of postcodes and
 * writes one comma separated record per matching table row.
 */
public class Scraper {

    /** Query URL to which "outward+inward" postcode parts are appended. */
    private static final String BASE_URL = "http://www.houseprices.co.uk/e.php?q=";

    /**
     * Prefix identifying the lines of the page that are parsed as records.
     * NOTE(review): the original literal was lost (HTML tags were stripped
     * from the source text); "&lt;tr" is a reconstruction - confirm against
     * the live page. Rows that do not have the expected cells are skipped, so
     * a prefix that is too permissive is harmless.
     */
    private static final String ROW_PREFIX = "<tr";

    /**
     * Delimiter used to cut a row into cells.
     * NOTE(review): reconstructed; the surviving code strips a 4 character
     * prefix (presumably "&lt;td&gt;") from a cell, which implies a split on
     * the closing cell tag - confirm.
     */
    private static final String CELL_END = "</td>";

    /** Pound sign, written as an escape so it survives any source encoding. */
    private static final String POUND = "\u00A3";

    /** Creates a new instance of Scraper. */
    public Scraper() {
    }

    /**
     * Main method.
     *
     * @param args optional; args[0] is the output file name
     * @throws Exception if the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        Scraper aScraper = new Scraper();
        aScraper.run(args);
    }

    /**
     * Runs the "Stuart" scrape, writing to args[0] or to "Houseprices.csv"
     * when no argument is given.
     *
     * @param args optional; args[0] is the output file name
     * @throws Exception if the output file cannot be written
     */
    public void run(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            args = new String[] { "Houseprices.csv" };
        }
        getHousepriceDataForStuart(args);
    }

    /**
     * Stuart wants data for "LS7 1" and "LS2 9": every postcode made of one
     * of those sectors followed by two letters A-Z is queried and the
     * resulting records are written to the file named by args[0].
     *
     * @param args args[0] is the output file name
     * @throws IllegalArgumentException if no output file name is supplied
     * @throws Exception if the output file cannot be opened or written
     */
    public void getHousepriceDataForStuart(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            throw new IllegalArgumentException("args[0] must name the output file");
        }
        Set<String> tPostcodes = buildPostcodes(new String[] { "LS7 1", "LS2 9" });
        PrintWriter aPrintWriter = new PrintWriter(new File(args[0]));
        try {
            for (String aPostcode : tPostcodes) {
                String[] splitPostcode = aPostcode.split(" ");
                scrapePostcode(splitPostcode[0], splitPostcode[1], aPrintWriter);
            }
        } finally {
            aPrintWriter.close();
        }
    }

    /**
     * Scrapes records for the postcodes listed in Gary's Hull postcode file,
     * using the original hard-coded input and output locations.
     *
     * @throws Exception if either file cannot be opened, read or written
     */
    public void getHousepriceDataForGary() throws Exception {
        getHousepriceDataForGary(
                new File("C:/Work/organisations/UoL/SoG/Students/Gary Wainman/Hull_postcodes.csv"),
                new File("C:/Work/organisations/UoL/SoG/Students/Gary Wainman/Hull_houseprices.csv"));
    }

    /**
     * Scrapes records for every postcode found in aFile and writes them to
     * outFile. The first token of aFile is treated as a header and skipped;
     * each following line must start with a 7 character fixed-width postcode.
     *
     * @param aFile input file of postcodes
     * @param outFile file the records are written to
     * @throws Exception if either file cannot be opened, read or written
     */
    public void getHousepriceDataForGary(File aFile, File outFile) throws Exception {
        // Input is opened first so that a missing input does not create an
        // empty output file.
        BufferedReader aReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(aFile)));
        try {
            PrintWriter aPrintWriter = new PrintWriter(outFile);
            try {
                StreamTokenizer aStreamTokenizer = new StreamTokenizer(aReader);
                aStreamTokenizer.eolIsSignificant(true);
                // Spaces and commas are word characters so that a whole line
                // comes back as a single word token.
                aStreamTokenizer.wordChars(' ', ' ');
                aStreamTokenizer.wordChars(',', ',');
                // Skip header first line
                aStreamTokenizer.nextToken();
                while (aStreamTokenizer.nextToken() != StreamTokenizer.TT_EOF) {
                    String line = aStreamTokenizer.sval;
                    // sval is null for number and single character tokens.
                    if (aStreamTokenizer.ttype == StreamTokenizer.TT_EOL || line == null) {
                        continue;
                    }
                    String[] parts = splitFixedWidthPostcode(line);
                    if (parts == null) {
                        System.err.println("Skipping line without a postcode: " + line);
                        continue;
                    }
                    scrapePostcode(parts[0], parts[1], aPrintWriter);
                }
            } finally {
                aPrintWriter.close();
            }
        } finally {
            aReader.close();
        }
    }

    /**
     * Downloads aURLString and converts every recognised table row into a
     * record prefixed with the given postcode. This is best effort: any
     * failure is reported on standard error and the records collected so far
     * (possibly none) are returned.
     *
     * @param aURLString page to download
     * @param tFirstPartOfPostcode first part of the postcode, e.g. "LS7"
     * @param tSecondPartOfPostcode second part of the postcode, e.g. "2EU"
     * @return the distinct records in page order; never null
     */
    public HashSet<String> getHTML(
            String aURLString,
            String tFirstPartOfPostcode,
            String tSecondPartOfPostcode) {
        // LinkedHashSet keeps the page order while still being a HashSet.
        HashSet<String> result = new LinkedHashSet<String>();
        HttpURLConnection aHttpURLConnection = null;
        BufferedReader aBufferedReader = null;
        try {
            URL aURL = new URL(aURLString);
            aHttpURLConnection = (HttpURLConnection) aURL.openConnection();
            aHttpURLConnection.setRequestMethod("GET");
            // NOTE(review): the page is decoded with the platform default
            // charset, as before - confirm it matches the site's encoding.
            aBufferedReader = new BufferedReader(
                    new InputStreamReader(aHttpURLConnection.getInputStream()));
            String line;
            while ((line = aBufferedReader.readLine()) != null) {
                if (!line.startsWith(ROW_PREFIX)) {
                    continue;
                }
                String tFormattedOutput =
                        formatRecord(line, tFirstPartOfPostcode, tSecondPartOfPostcode);
                if (tFormattedOutput != null) {
                    result.add(tFormattedOutput);
                }
            }
        } catch (Exception aException) {
            System.err.println("Failed to scrape " + aURLString);
            aException.printStackTrace();
        } finally {
            if (aBufferedReader != null) {
                try {
                    aBufferedReader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
            if (aHttpURLConnection != null) {
                aHttpURLConnection.disconnect();
            }
        }
        return result;
    }

    /**
     * Builds every postcode formed by a sector followed by two letters A-Z.
     *
     * @param sectors leading parts such as "LS7 1"
     * @return 26 * 26 postcodes per distinct sector, in generation order
     */
    static Set<String> buildPostcodes(String[] sectors) {
        Set<String> postcodes = new LinkedHashSet<String>();
        for (String sector : sectors) {
            for (char first = 'A'; first <= 'Z'; first++) {
                for (char second = 'A'; second <= 'Z'; second++) {
                    postcodes.add(sector + first + second);
                }
            }
        }
        return postcodes;
    }

    /**
     * Splits the 7 character fixed-width postcode at the start of line into
     * its two parts: characters 0-3 (space padding removed) and 4-6.
     *
     * @param line text starting with the postcode
     * @return {first part, second part}, or null if line is too short
     */
    static String[] splitFixedWidthPostcode(String line) {
        if (line == null || line.length() < 7) {
            return null;
        }
        return new String[] { line.substring(0, 4).trim(), line.substring(4, 7) };
    }

    /**
     * Builds the query URL for a postcode.
     *
     * @param tFirstPartOfPostcode first part of the postcode
     * @param tSecondPartOfPostcode second part of the postcode
     * @return the base URL followed by the two parts joined with "+"
     */
    static String buildUrl(String tFirstPartOfPostcode, String tSecondPartOfPostcode) {
        return BASE_URL + tFirstPartOfPostcode + "+" + tSecondPartOfPostcode;
    }

    /**
     * Converts one table row into a comma separated record.
     *
     * The record holds, in order: the postcode; the last 10 characters of
     * cell 0 (presumably a date - confirm); the text after the pound sign in
     * cell 1 with commas removed; the text after the first "&gt;" of cell 2;
     * the last character of cell 3; the text after the first "&gt;" of
     * cell 4; and cell 6 without its first 4 characters, in double quotes.
     *
     * @param line one line of the page
     * @param tFirstPartOfPostcode first part of the postcode
     * @param tSecondPartOfPostcode second part of the postcode
     * @return the record, or null if the row lacks the expected cells
     */
    static String formatRecord(
            String line,
            String tFirstPartOfPostcode,
            String tSecondPartOfPostcode) {
        String[] items = line.split(CELL_END);
        if (items.length < 7) {
            return null;
        }
        String[] price = items[1].split(POUND);
        String[] third = items[2].split(">");
        String[] fifth = items[4].split(">");
        if (items[0].length() < 10
                || price.length < 2
                || third.length < 2
                || items[3].length() == 0
                || fifth.length < 2
                || items[6].length() < 4) {
            return null;
        }
        return tFirstPartOfPostcode + " " + tSecondPartOfPostcode
                + "," + items[0].substring(items[0].length() - 10)
                + "," + price[1].replace(",", "")
                + "," + third[1]
                + "," + items[3].substring(items[3].length() - 1)
                + "," + fifth[1]
                + ",\"" + items[6].substring(4) + "\"";
    }

    /**
     * Fetches the records for one postcode and appends them to out.
     *
     * @param first first part of the postcode
     * @param second second part of the postcode
     * @param out destination for the records
     * @throws IOException if out reports a write error
     */
    private void scrapePostcode(String first, String second, PrintWriter out)
            throws IOException {
        String aURLString = buildUrl(first, second);
        for (String record : getHTML(aURLString, first, second)) {
            out.println(record);
        }
        // checkError() flushes, and is the only way PrintWriter reports failure.
        if (out.checkError()) {
            throw new IOException("Failed to write records for " + first + " " + second);
        }
        System.out.println("Done " + aURLString);
    }
}