blob: 6ad241c42dc6f1e0bf8e854cb4959f7e0e329a52 [file] [log] [blame]
#!/usr/bin/perl
# Copyright (c) 2006 Eclipse Foundation, made available under EPL v1.0
# Contributors Ward Cunningham, Bjorn Freeman-Benson
#
# usage:
# get-companies.pl
# vi companies-alternate.txt
# extract.pl | parse.pl > TAB_FILE
#
use strict;
my $prefix;
my $line;
my $separator = "----------------------------\n";
my $terminator = "=============================================================================\n";
my %types = (
'' => 'none',
'htm' => 'html',
'jpg' => 'jpeg'
);
my @keys = (
'DATE','YEAR','YEARMONTH','YEARMONTHDAY',
'TOPPROJECT','PROJECT','FILENAME','FILETYPE',
'REVISION','CHANGE_SIZE','MESSAGE_SIZE',
'LOGIN','COMPANY'
);
my %values;
my %typesx;
my %progress;
my %companies;
my @companydata = split /\n/, `cat companies.txt companies-alternate.txt`;
for my $each ( @companydata ) {
my @wrds = split /\t+/, $each;
$companies{$wrds[0]} = $wrds[1];
}
print "#", map("$_\t", @keys), "\n";
while ($_ = <>){
chomp;
process();
}
sub process {
($values{PROJECT}, $values{TOPPROJECT}, $prefix) = /^(([^\.\t]+).*)\t(.*)\/[^\/]+$/;
my $progress = $progress{$values{PROJECT}}++;
print stderr "\nparsing $values{PROJECT}\n" unless $progress;
print stderr "." unless $progress % 1000;
while(1) {
last unless $line = <>;
#print "A: " . $line;
last if $line =~ /Working file:/;
}
$line =~ /Working file: (.+?(\.(\w+))?)$/;
$values{FILENAME} = "$prefix/$1";
$values{FILETYPE} = "\L$3";
$values{FILETYPE} = $types{$values{FILETYPE}} if $types{$values{FILETYPE}};
#print stderr "new $values{FILETYPE}\n" unless $typesx{$values{FILETYPE}}++;
$line = <>;
#print "B: " . $line;
while(1) {
while(1) {
last if $line eq $separator or $line eq $terminator;
last unless $line = <>;
#print "B2: " . $line;
}
last if $line eq $terminator;
last unless $line = <>;
#print "C: " . $line;
($values{REVISION}) = $line =~ /revision ([\d\.]+)/;
last unless $line = <>;
#print "D: " . $line;
$line =~ /date: ((....)\/(..)\/(..)[^;]+);\s+author: ([\w\.\-_]+);/;
$values{DATE} = $1;
$values{YEAR} = $2;
$values{YEARMONTH} = "$2$3";
$values{YEARMONTHDAY} = "$2$3$4";
$values{LOGIN} = $5;
$values{CHANGE_SIZE} = 0;
if( $line =~ /lines: \+(\d+) \-(\d+)/ ) {
$values{CHANGE_SIZE} = $1 + $2;
}
$values{MESSAGE_SIZE} = 0;
my %words;
while(1) {
last unless $line = <>;
#print "E: " . $line;
last if $line eq $separator or $line eq $terminator;
$values{MESSAGE_SIZE} += length $line;
my @words = $line =~ /(\w{5,})/g;
for (@words) {
$words{"\L$_"}++;
}
}
next if $values{FILENAME} =~ /\'/;
# our database code cannot handle
# single quotes so ignore files with those characters
$values{COMPANY} = $companies{$values{LOGIN}};
$values{COMPANY} = 'unknown' unless $values{COMPANY};
print map("$values{$_}\t", @keys), "\n";
my @words = sort keys %words;
next unless @words;
open (P, ">>words/$values{PROJECT}");
print P $values{YEARMONTHDAY}, map (" $_", @words), "\n";
}
}