# webdb -- helpers for checking, re-packing and uploading gzip-compressed ARC
# files.  ARC files are read through the ARC class imported from BAT
# (constructed with gzip=>1 and iterated with next_record(); the records are
# used here as hashes with name, ip, record_date, mime and content entries).
# validate_file, rewrite_file and load_arc report their outcome to the caller
# as a plain string that starts with "Success" or "Error:".
#
# NOTE(review): this copy of the file looks damaged and does not parse as-is:
#   * all original newlines were collapsed, so the "#use vars ..." comment (and
#     the later "#print ..." / "# # document:content ..." comments) now swallow
#     the code that follows them on the same physical line;
#   * several here-documents have lost their bodies -- see
#     "print $output <next_record() ) {" in rewrite_file (the text written at
#     the start of the output file and the opening of the record loop are
#     missing) and "my $document =< ENDDOC", "$document .=< ENDDOC" and the
#     audit document in load_arc;
#   * the literal pieces around "$family:$key" in format_columns were
#     presumably markup as well -- TODO confirm against the original.
#   Restore these from version control before relying on the code below; the
#   descriptions that follow cover only what is still visible.
#
# validate_file( $file, $user, $provided_key )
#   Opens $file as a gzip ARC and folds every record into a running SHA-1 key.
#   The key starts as sha1_hex("HASH SECRET"); for each record whose name
#   matches http(.?)://host/path it becomes sha1_hex("$key $host $path"), where
#   $path defaults to "/" and ":443" is appended to the host when the captured
#   character is "s" and the host carries no explicit port.  A record whose
#   name does not match aborts with an "Error: Bad record ..." string.  Finally
#   $user is folded in and the result is compared (eq) with $provided_key.
#   Returns "Success: Keys match" or an "Error: ..." string.
#
# rewrite_file( $filein, $fileout )
#   Reads the gzip ARC $filein and writes $fileout through IO::Zlib together
#   with a plain-text index "$fileout.idx".  For every record the size of
#   $fileout is taken with stat (element 7) before and after the record is
#   appended: the file is re-opened in "ab" mode and closed again per record so
#   the size on disk is current.  Each record is written as one line
#   "name ip record_date mime length" (length is the content length plus one,
#   for the trailing newline) followed by the content and "\n"; the index gets
#   "name offset_start len".  Returns "Success" or an "Error: ..." string.
#   The INDEX handle is a bareword opened with two-argument open, and it is
#   not closed on the early return taken when IO::Zlib->new fails.
#
# format_columns( $family, %fields )
#   Concatenates, for every key/value pair, "$family:$key", the base64 encoding
#   of the value and "\n".  Pairs come out in the order produced by each(), and
#   the result is undef when %fields is empty because $ret is never
#   initialised.
#
# load_arc( $file, $user, $timestamp )
#   Posts every named record of the gzip ARC $file, using LWP::UserAgent with
#   agent string "GrubLoader/0.1", to
#   http://gateway.isc.swlabs.org:60010/api/webdb/row/$key/$timestamp.
#   The row key is built from the record URL: the protocol part is removed,
#   the domain labels are reversed (left untouched when the domain looks like a
#   dotted quad), the path is appended after "/", a trailing "/" is restored
#   when the URL ended in one and the path is non-empty, and finally every
#   non-alphanumeric character is percent-encoded.  "$timestamp:" plus the
#   base64 of the unescaped key is remembered in @keys_added.  The "meta"
#   columns carry origURL, postedBy, sourceFile and, when
#   HTTP::Response->parse returns a true value, selected response headers, the
#   status code and the full header block.  After the loop an audit row with
#   grubUploads:file and grubUploads:urls is posted to
#   .../api/userAudit/row/$escapedUser/$timestamp.  Any unsuccessful response
#   stops processing and returns an "Error: HBase Loader: ..." string holding
#   the document sent and the response received; otherwise "Success".
#   $encoded_file and @parts are each declared with "my" more than once in the
#   same scope, and $encoded_page is only referenced from the commented-out
#   document:content section in what is visible here.
package webdb; use strict; use MIME::Base64; use LWP::UserAgent; use HTTP::Response; use BAT 'ARC'; use XML::Simple; use Data::Dumper; use Digest::SHA1 qw(sha1_hex); use IO::Zlib; #use vars qw(@EXPORT); #@EXPORT = qw(load_arc); sub validate_file { my ( $file, $user, $provided_key ) = @_; my $key = sha1_hex( "HASH SECRET" ); my $arc; return "Error: Uploaded file is not a compressed arc file" unless $arc = new ARC(gzip=>1, filename=>$file); while ( my $record = $arc->next_record() ) { if ( $record->{name} =~ /http(.?)\:\/\/([^\/]+)(.*)/ ) { my $path = $3 || "/"; my $host = ($1 eq "s" && $2 !~ /\:/)?"$2:443":$2; $key = sha1_hex( "$key $host $path" ); } else { return "Error: Bad record found in arc file, I'm not going to accept this file"; } } $key = sha1_hex( "$key $user" ); if ( $key eq $provided_key ) { return "Success: Keys match"; } else { return "Error: Your key is invalid, I'm not going to accept this file"; } } sub rewrite_file { my ( $filein, $fileout ) = @_; my $arc; return "Error: Uploaded file is not a compressed arc file" unless $arc = new ARC(gzip=>1, filename=>$filein); open INDEX, ">$fileout.idx" or return "Error: cannot open file $!"; my @parts = split( /\//, $fileout ); my $filenameonly = pop @parts; my $output = IO::Zlib->new($fileout, "wb") or return "Error: cannot open file $!"; print $output <next_record() ) { my $offset_start = (stat($fileout))[7]; $output = IO::Zlib->new($fileout, "ab"); print $output $record->{name} . " " . $record->{ip} . " " . $record->{record_date} . " " . $record->{mime} . " " . (length( $record->{content} )+1) . "\n" . $record->{content} . "\n"; close( $output ); my $offset_end = (stat($fileout))[7]; my $len = $offset_end - $offset_start; # print "Offset: $offset_start Len: $len\n"; print INDEX $record->{name} . " $offset_start $len\n"; } close INDEX; return "Success"; } sub format_columns { my ($family, %fields) = @_; my $ret; while( my ($key, $value) = each %fields ) { $ret .= "$family:$key" . encode_base64( $value ) . 
"\n"; } return $ret; } sub load_arc { my ( $file, $user, $timestamp ) = @_; my $ua = LWP::UserAgent->new; $ua->agent( "GrubLoader/0.1" ); my $encoded_file = encode_base64( $file ); my $arc; return "Error: HBase Loader: Can't open your ARC file, is it mailformed?" unless $arc = new ARC(gzip=>1, filename=>$file); my @keys_added; while ( my $record = $arc->next_record() ) { if ( !$record->{name} ) { #print "Poorly formed record (no URL), skipping\n"; next; } # Pull the protocol part off my @parts = split( /:\/\//, $record->{name} ); shift @parts; my $url_only = join( "://", @parts ); # Seperate off the domain name my @parts = split( /\//, $url_only ); my $domain = shift @parts; my $path = join( "/", @parts ); # Reverse the domain name parts my $domain_key; if ( $domain =~ /\d+\.\d+\.\d+\.\d+/ ) { $domain_key = $domain; } else { my @parts = split( /\./, $domain ); $domain_key = join( ".", reverse @parts ); } my $key = $domain_key . "/" . $path; if ( $record->{name} =~ /\/$/ && $path ) { $key .= "/"; } push( @keys_added, "$timestamp:" . 
encode_base64( $key ) ); $key =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; # Fields for the metat column family my %meta_fields = ( origURL => "", postedBy => "", sourceFile => "", headerContentType => "", headerLastModified => "", headerETag => "", httpResultCode => "", ); $meta_fields{origURL} = $record->{name}; $meta_fields{postedBy} = $user; $meta_fields{sourceFile} = $file; my $result = HTTP::Response->parse( $record->{content} ); if ( $result ) { $meta_fields{headerContentType} = $result->header( "Content-type" ); $meta_fields{headerLastModified} = $result->header( "Last-Modified" ); $meta_fields{headerETag} = $result->header( "ETag" ); $meta_fields{httpResultCode} = $result->code; $meta_fields{responseHeaders} = $result->headers_as_string( "\n" ); } my $encoded_page = encode_base64( $record->{content} ); # # document:content # $encoded_page # my $document =< ENDDOC $document .= format_columns( "meta", %meta_fields ); $document .=< ENDDOC my $req = HTTP::Request->new( POST => "http://gateway.isc.swlabs.org:60010/api/webdb/row/$key/$timestamp" ); $req->header('Accept' => 'application/xml'); $req->content( $document ); my $res = $ua->request($req); if ( !$res->is_success() ) { return "Error: HBase Loader: Sent:\n$document\n\nGot:\n" . $res->as_string . "\n"; #die; } } # Record what we just did into the audit table my $encoded_file = encode_base64( $file ); my $encoded_urls = encode_base64( join( ",", @keys_added ) ); my $document =< grubUploads:file $encoded_file grubUploads:urls $encoded_urls ENDDOC my $escapedUser = $user; $escapedUser =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; my $req = HTTP::Request->new( POST => "http://gateway.isc.swlabs.org:60010/api/userAudit/row/$escapedUser/$timestamp" ); $req->header('Accept' => 'application/xml'); $req->content( $document ); my $res = $ua->request($req); if ( !$res->is_success() ) { return "Error: HBase Loader: Sent:\n$document\n\nGot:\n" . $res->as_string . "\n"; #die; } return "Success"; } 1