| #!/usr/bin/perl |
| # Copyright (c) 2006 Eclipse Foundation, made available under EPL v1.0 |
| # Contributors Ward Cunningham, Bjorn Freeman-Benson |
| # |
| # usage: |
| # get-companies.pl |
| # vi companies-alternate.txt |
| # extract.pl | parse.pl > TAB_FILE |
| # |
| |
| use strict; |
| |
| my $prefix; |
| my $line; |
| |
| my $separator = "----------------------------\n"; |
| my $terminator = "=============================================================================\n"; |
| |
| my %types = ( |
| '' => 'none', |
| 'htm' => 'html', |
| 'jpg' => 'jpeg' |
| ); |
| |
| my @keys = ( |
| 'DATE','YEAR','YEARMONTH','YEARMONTHDAY', |
| 'TOPPROJECT','PROJECT','FILENAME','FILETYPE', |
| 'REVISION','CHANGE_SIZE','MESSAGE_SIZE', |
| 'LOGIN','COMPANY' |
| ); |
| my %values; |
| my %typesx; |
| my %progress; |
| |
| my %companies; |
| my @companydata = split /\n/, `cat companies.txt companies-alternate.txt`; |
| for my $each ( @companydata ) { |
| my @wrds = split /\t+/, $each; |
| $companies{$wrds[0]} = $wrds[1]; |
| } |
| |
| print "#", map("$_\t", @keys), "\n"; |
| |
| while ($_ = <>){ |
| chomp; |
| process(); |
| } |
| |
| sub process { |
| |
| ($values{PROJECT}, $values{TOPPROJECT}, $prefix) = /^(([^\.\t]+).*)\t(.*)\/[^\/]+$/; |
| |
| my $progress = $progress{$values{PROJECT}}++; |
| print stderr "\nparsing $values{PROJECT}\n" unless $progress; |
| print stderr "." unless $progress % 1000; |
| |
| |
| while(1) { |
| last unless $line = <>; |
| #print "A: " . $line; |
| last if $line =~ /Working file:/; |
| } |
| $line =~ /Working file: (.+?(\.(\w+))?)$/; |
| $values{FILENAME} = "$prefix/$1"; |
| $values{FILETYPE} = "\L$3"; |
| $values{FILETYPE} = $types{$values{FILETYPE}} if $types{$values{FILETYPE}}; |
| #print stderr "new $values{FILETYPE}\n" unless $typesx{$values{FILETYPE}}++; |
| |
| $line = <>; |
| #print "B: " . $line; |
| while(1) { |
| while(1) { |
| last if $line eq $separator or $line eq $terminator; |
| last unless $line = <>; |
| #print "B2: " . $line; |
| } |
| last if $line eq $terminator; |
| last unless $line = <>; |
| #print "C: " . $line; |
| ($values{REVISION}) = $line =~ /revision ([\d\.]+)/; |
| |
| last unless $line = <>; |
| #print "D: " . $line; |
| $line =~ /date: ((....)\/(..)\/(..)[^;]+);\s+author: ([\w\.\-_]+);/; |
| $values{DATE} = $1; |
| $values{YEAR} = $2; |
| $values{YEARMONTH} = "$2$3"; |
| $values{YEARMONTHDAY} = "$2$3$4"; |
| $values{LOGIN} = $5; |
| $values{CHANGE_SIZE} = 0; |
| if( $line =~ /lines: \+(\d+) \-(\d+)/ ) { |
| $values{CHANGE_SIZE} = $1 + $2; |
| } |
| $values{MESSAGE_SIZE} = 0; |
| my %words; |
| while(1) { |
| last unless $line = <>; |
| #print "E: " . $line; |
| last if $line eq $separator or $line eq $terminator; |
| $values{MESSAGE_SIZE} += length $line; |
| my @words = $line =~ /(\w{5,})/g; |
| for (@words) { |
| $words{"\L$_"}++; |
| } |
| } |
| next if $values{FILENAME} =~ /\'/; |
| # our database code cannot handle |
| # single quotes so ignore files with those characters |
| $values{COMPANY} = $companies{$values{LOGIN}}; |
| $values{COMPANY} = 'unknown' unless $values{COMPANY}; |
| print map("$values{$_}\t", @keys), "\n"; |
| my @words = sort keys %words; |
| next unless @words; |
| open (P, ">>words/$values{PROJECT}"); |
| print P $values{YEARMONTHDAY}, map (" $_", @words), "\n"; |
| } |
| } |