hbase-user mailing list archives

From Derek Pappas <depap...@yahoo.com>
Subject Re: production usage of HBase
Date Sun, 18 Jan 2009 08:16:52 GMT
//package com.yoterra.se.afp;

import java.io.IOException;

//
// Sample HBase data importer
// Reads from a file called cdr.data and injects into the cdrs table,
// reporting the date/time every 1000 commits.
//   by Anders Brownworth
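// (As used below, this version has been adapted to read a web-archive (arc)
// file named on the command line and to write the parsed records into an
// existing HBase table called "yotest1".)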
import java.io.*;
import java.util.*;
//import org.apache.log4j.BasicConfigurator;
//import org.apache.log4j.Level;
//import org.apache.log4j.Logger;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.io.*;
import org.apache.hadoop.hbase.client.*;

public class ArcParserImporter {
	
    // Read one newline-terminated line from the stream, returning the line
    // without its trailing newline, or null on end of file or error.
    private static String rl( FileInputStream in ) {
        try {
            int i;
            StringBuilder s = new StringBuilder();
            while ( ( i = in.read() ) != '\n' ) {
                if ( i == -1 ) return null;
                s.append( (char) i );
            }
            // System.out.println( i + "=" + s );
            return s.toString();
        }
        catch ( IOException ex ) {
            ex.printStackTrace();
            return null;
        }
    }

    public static void main( String args[] ) throws IOException {
        System.out.print( "starting " + new Date() + "..." );

        try {

            int counter = 0;
            int act = 0;
            int limit = 0;
            int exno = 0;
            boolean errf = false;

            if ( args.length == 0 ) {
                System.out.println( "[ERROR] exiting program - no arc file passed as arg" );
                System.exit( 1 );
            }

            FileInputStream in = new FileInputStream( args[0] );
            String line;

            // Read the arc file header and forget about it: skip lines until the
            // first empty line (zero tokens), which marks the end of the header.
            // System.out.println( "Header ... " );
            int i = 0;
            int t = 0;
            do {
                line = rl( in );
                if ( line == null ) {
                    System.out.println( "[ERROR] unexpected end of file while reading the arc header" );
                    System.exit( 1 );
                }
                StringTokenizer st = new StringTokenizer( line, " " );
                i++;
                t = st.countTokens();
                // System.out.println( "^^^" + t );
            } while ( t != 0 );

            // should be done with the header, now for the interesting part

            // Let's open the HBase connection...
            // (This assumes the "yotest1" table already exists with the column
            // families used in the puts below.)
            HBaseConfiguration hc = new HBaseConfiguration( new Configuration() );
            HTable ht = new HTable( hc, "yotest1" );
            // We have the table handle now and can start to loop over the
            // arc file content.

            do {
                StringTokenizer st = new StringTokenizer( line, " " );
                // Each record header line has 5 tokens: url, ip, timestamp, mime type and length.
                if ( st.countTokens() == 5 ) {
                    String url  = st.nextToken();
                    String ip   = st.nextToken();
                    String ts   = st.nextToken();
                    String mime = st.nextToken();
                    String len  = st.nextToken();
                    // System.out.println( url + " " + ip + " " + ts + " " + mime + " " + len );
                    java.util.Scanner sk = new java.util.Scanner( len );
                    try {
                        limit = sk.nextInt();
                        byte[] body = new byte[limit];
                        // Read the full record body; a single read() may return fewer
                        // than 'limit' bytes, so keep reading until done or EOF.
                        act = 0;
                        while ( act < limit ) {
                            int n = in.read( body, act, limit - act );
                            if ( n == -1 ) break;
                            act += n;
                        }
                        // We have read the record header and body; if it is html we
                        // can do the clustering process and write the result into hbase.
                        if ( mime.compareTo( "text/html" ) == 0 ) {
                            counter++;
                            // write it into hbase now, retrying the commit after an IOException
                            exno = 0;
                            do {
                                // reset the flag each attempt so a successful retry leaves the loop
                                errf = false;
                                try {
                                    BatchUpdate bu = new BatchUpdate( url );
                                    bu.put( "crawltime:", ts.getBytes() );
                                    bu.put( "ip:", ip.getBytes() );
                                    bu.put( "mime:", mime.getBytes() );
                                    // need to parse first:     bu.put( "respcode:", body );
                                    // need to calculate first: bu.put( "offset:", etBytes() );
                                    bu.put( "size:", len.getBytes() );
                                    // bu.put( "file:", arc.getBytes() );
                                    bu.put( "resp:", body );
                                    String clno = "" + counter;
                                    bu.put( "clusterno:", clno.getBytes() );
                                    ht.commit( bu );
                                } catch ( IOException aex ) {
                                    exno++;
                                    System.out.println( "IO Exception No=" + exno );
                                    try {
                                        Thread.sleep( 60000 );
                                    } catch ( InterruptedException ee ) {
                                        System.out.println( "Time to exit..." );
                                        System.exit( 1 );
                                    }
                                    // re-open the connection and try the commit again
                                    hc = new HBaseConfiguration( new Configuration() );
                                    ht = new HTable( hc, "yotest1" );
                                    errf = true;
                                }
                            } while ( errf );
                        }

                    }
                    catch ( InputMismatchException ex ) { System.out.println( "Odd..." ); }
                    // catch ( SimpleHtmlParserException e ) { e.printStackTrace(); }
                }
            } while ( ( line = rl( in ) ) != null );
            System.out.println( "Wrote " + counter + " pages " + new Date() );
        } catch ( IOException ex ) { ex.printStackTrace(); }
        // } catch ( Exception e ) {
        //     e.printStackTrace();
        // }

    }

}
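
For anyone who wants to try it: the importer takes the arc file to load as
its single command-line argument and assumes the "yotest1" table (with the
column families used above) already exists, so the invocation is roughly

    java -cp <hadoop/hbase jars and conf>:. ArcParserImporter <arcfile>

It exits immediately with an error if no argument is given.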

On Jan 17, 2009, at 3:37 PM, stack wrote:

> Derek Pappas wrote:
>> No. See attached program. It parses the arc files and writes the  
>> html records to hbase.
>> 5 data nodes and 3 regions.
>>
> I don't believe this list allows attachments (Program did not come  
> across).  Put it up in pastebin?
>
>>
>> Single threaded.
>>
> How many instances do you have running?  One only?
>
>
>>> Tell us what you are seeing in your logs so we can help.  Make  
>>> sure you have DEBUG enabled (see earlier in the FAQ that J-D  
>>> pointed you at for how).
>>>
>>> The errors posted below (datanodes complaining about blocks) should,
>>> as J-D indicates, mostly be addressed by the troubleshooting section
>>> he pointed you to.  You might also check the datanode logs for
>>> errors; that could help give us a clue about why the failures happen.
>>>
>>> Meantime, how many regions when it fails?  Tell us about your  
>>> schema and your hardware.
>>
>> Dell 850s, Super Micro Core Duos, and a quad core.
>>
>> 5 data nodes, 3 regions.
>
> Add your configuration to pastebin too.  What's your schema like?
> How many column families?
>
> Anything else running on these systems?  They should be well able to
> handle this (how much RAM do they have -- are you swapping?).
>
> Enable DEBUG and paste exceptions from regionserver logs including  
> the lines that lead up to the exception.
>
> Check your datanode logs too.
>
> St.Ack
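
(Re the DEBUG suggestion above: if I follow the FAQ correctly, it comes
down to setting

    log4j.logger.org.apache.hadoop.hbase=DEBUG

in conf/log4j.properties on the region servers and restarting them.)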

Best Regards,

Derek Pappas
depappas at yahoo d0t com




