#!/usr/bin/php
<?php
/*******************************************************************************
* Copyright (c) 2010 Eclipse Foundation and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Wayne Beaton (Eclipse Foundation)- initial API and implementation
*******************************************************************************/
/**
* This script parses the output of the git log command (with the --numstat
* option to include file names) and generates the tabular output format
* required by Dash.
*
* Usage:
*
* git --git-dir=/gitroot/woolsey/iplog/.git/ log --numstat | ./git-parse.php
*
* Examples:
* echo -e "technology.dash.woolsey\t/gitroot/woolsey/iplog/.git/" | ./git-extract.php | ./git-parse.php
* wget http://www.eclipse.org/projects/web-api/roots-generator.php?type=git -O - 2> /dev/null | ./git-extract.php | ./git-parse.php
* wget http://www.eclipse.org/projects/web-api/roots-generator.php?type=git -O - 2> /dev/null | grep virgo | ./git-extract.php | ./git-parse.php | ./chunk.pl ./insert.pl
*
* @author Wayne Beaton
*/
/*
 * Parser state. Every field starts out as 'unknown' and is overwritten
 * as the matching header line is encountered in the input.
 */
$project = $top = $commit = 'unknown';
$login = $company = 'unknown';
$date = $year = $yearmonth = $yearmonthday = 'unknown';

// Column names, in the same order as the fields of each emitted record.
$headers = array(
    "DATE",
    "YEAR",
    "YEARMONTH",
    "YEARMONTHDAY",
    "TOPPROJECT",
    "PROJECT",
    "FILENAME",
    "FILETYPE",
    "REVISION",
    "CHANGE_SIZE",
    "MESSAGE_SIZE",
    "LOGIN",
    "COMPANY"
);

// Project-name pattern fragments. Neither variable is referenced again
// in this file as shown; they are kept in case something else reads them.
$projectNameSegmentPattern = "[a-zA-Z0-9\\-]+";
$projectNamePattern = "(([\w\-]+)(\.[\w\-]+){0,2})";

// The header row is written as a '#'-prefixed, tab-separated line.
echo "#", implode("\t", $headers), "\n";
/*
* The logic here is this: We encounter the information
* in a line-by-line manner. As we encounter the basic information
* (like the committer identity, commit Id, etc), we remember it.
* As we encounter file names, we use the information we've collected
* so far to output a record for that file name.
*
* We assume that all the appropriate header information is always
* included as part of each block.
*
* Data comes to us in this form:
* ---------------------------------------------------------
* Project: technology.dash.woolsey
* commit 4e918dade5701c6cf03ad3608489ce00738fc8b5
* Author: Wayne Beaton <wayne@eclipse.org>
* AuthorDate: Thu Dec 16 23:53:58 2010 -0500
* Commit: Wayne Beaton <wayne@eclipse.org>
* CommitDate: Thu Dec 16 23:53:58 2010 -0500
*
* Bug 332692 Added support to capture date as part of submit request. Also added some rudimentary testing of input data and error reporting.
*
* 2 1 org.eclipse.woolsey.iplog.submit/META-INF/MANIFEST.MF
* 4 1 org.eclipse.woolsey.iplog.submit/futz.jpage
* 13 8 org.eclipse.woolsey.iplog.submit/src/org/eclipse/woolsey/iplog/submit/IpzillaClient.java
* 40 5 org.eclipse.woolsey.iplog.submit/src/org/eclipse/woolsey/iplog/submit/wizards/SubmitInfoWizardPage.java
* 60 8 org.eclipse.woolsey.iplog.submit/src/org/eclipse/woolsey/iplog/submit/wizards/SubmitOperation.java
* ---------------------------------------------------------
*
* So, as we encounter the Project, commit, Author, and Date lines, we
* record that information. It's only when we hit the file name lines
* (at the bottom) that we actually write anything out.
*/
/*
 * Main parse loop: read the input from STDIN one line at a time,
 * remember the most recent Project/commit/committer/date headers, and
 * write one tab-separated record for every file (numstat) line.
 *
 * We loop on the result of fgets() rather than on feof(): fgets()
 * answers false at end of input (and on a read error), so we never try
 * to parse a non-line and cannot spin forever on a broken stream.
 */
while (($line = fgets(STDIN)) !== false) {
    // For example:
    // Project: technology.dash.woolsey
    if (preg_match("/^Project: (([\w\-]+)(\.[\w\-]+){0,2})$/i", $line, $matches)) {
        $project = $matches[1]; // the full project id
        $top = $matches[2];     // the first segment of the project id

        // Just to be safe and make sure that information doesn't
        // bleed from one project to the next, we reset everything.
        $commit = 'unknown';
        $login = $company = 'unknown';
        $date = $year = $yearmonth = $yearmonthday = 'unknown';
    }

    // For example:
    // commit 4e918dade5701c6cf03ad3608489ce00738fc8b5
    if (preg_match('/^commit ([a-f0-9]+)$/i', $line, $matches)) {
        $commit = $matches[1];

        // Just to be safe and make sure that information doesn't
        // bleed from one commit to the next, we reset everything.
        $login = $company = 'unknown';
        $date = $year = $yearmonth = $yearmonthday = 'unknown';
    }

    // Note that the committer could be an email address or
    // a committer id (we expect that it is most likely an email
    // address).
    //
    // For example:
    // Commit: Wayne Beaton <wayne@eclipse.org>
    // Commit: wbeaton
    // Commit: spingel <>
    // Commit: Steffen Pingel <steffen.pingel@tasktop.com>
    // Commit: steffen.pingel@tasktop.com
    if (preg_match('/^Commit:/', $line)) {
        if (preg_match('/<([^>]+)>/i', $line, $matches)) {
            // First choice: a non-empty value between <>.
            $login = getCommitterId(trim($matches[1]));
            $company = getCommitterCompany($login);
        } elseif (preg_match('/^Commit:\s*(\w+)/', $line, $matches)) {
            // Otherwise, grab the first word after "Commit:" and hope for the best.
            $login = getCommitterId($matches[1]);
            $company = getCommitterCompany($login);
        }
        // If neither matched, login and company keep their current values.
    }

    // For example:
    // CommitDate: Thu Dec 16 23:53:58 2010 -0500
    if (preg_match('/^CommitDate: (.*)$/', $line, $matches)) {
        $timestamp = strtotime(trim($matches[1]));
        if ($timestamp === false) {
            // The date could not be parsed. Say so explicitly instead of
            // letting date() format "false" as the start of 1970; the
            // file lines of this commit are skipped below either way.
            $date = $year = $yearmonth = $yearmonthday = 'unknown';
        } else {
            $year = date('Y', $timestamp);
            $yearmonth = date('Ym', $timestamp);
            $yearmonthday = date('Ymd', $timestamp);
            $date = date("Y/m/d", $timestamp);
        }
    }

    // For example:
    // 2 1 org.eclipse.woolsey.iplog.submit/META-INF/MANIFEST.MF
    //
    // NOTE(review): the path must start with a word character, so a path
    // such as ".project" is not reported, and neither is any line whose
    // counts are not digits. Presumably intentional -- confirm.
    if (preg_match('/^([0-9]+)\s+([0-9]+)\s+(\w.*)$/', $line, $matches)) {
        // Ignore weird data (see Bug 333620). (int)'unknown' is 0, so this
        // also skips files seen before a usable CommitDate line.
        if ((int)$year < 2000) continue;

        $added = $matches[1];
        $removed = $matches[2];
        $change_size = $added + $removed;
        $message_size = 0; // always reported as zero
        $filename = $matches[3];
        $filetype = getFileType($filename);

        echo "$date\t$year\t$yearmonth\t$yearmonthday\t$top\t$project\t$filename\t$filetype\t$commit\t$change_size\t$message_size\t$login\t$company\n";
    }
}
// TODO Move to a "common" import
/**
 * Answers the file type for a file name, derived from its extension.
 *
 * The extension is the run of word characters after the last '.' at the
 * end of the name. 'htm' is reported as 'html' and 'jpg' as 'jpeg'; any
 * other extension is answered unchanged (the match is case sensitive).
 *
 * @param string $filename the file name or path
 * @return string the file type, or 'unknown' when the name does not end
 *         in a '.' followed by word characters
 */
function getFileType($filename) {
    // Extensions that are folded into a canonical type name.
    $aliases = array('htm' => 'html', 'jpg' => 'jpeg');

    if (!preg_match('/.*\.(\w+)$/', $filename, $found)) {
        return 'unknown';
    }

    $ext = $found[1];
    return isset($aliases[$ext]) ? $aliases[$ext] : $ext;
}
/**
 * Answers the committer id for the identity found on a "Commit:" line.
 *
 * The value is lower-cased and then resolved in this order:
 *  1. a known email address is translated through the email-to-committer map;
 *  2. a value of the form "id@token" (word characters, '@', then only word
 *     characters and '-') answers the part before the '@';
 *  3. anything else is answered as-is (lower-cased).
 *
 * @param string $address an email address or committer id
 * @return string the committer id
 */
function getCommitterId($address) {
    $normalized = strtolower($address);

    // Known email address? Then we already have the committer id.
    $emailToId = getEmailToCommitterMap();
    if (isset($emailToId[$normalized])) {
        return $emailToId[$normalized];
    }

    /*
     * A committer id followed by a UUID is probably a commit that has
     * been migrated from CVS; the committer id is the part before '@'.
     *
     * e.g. dschaefer@6a79697e-3843-0410-8446-a9668620458d
     */
    $looksMigrated = preg_match('/^(\w+)@[\w\-]+$/', $normalized, $found);

    return $looksMigrated ? $found[1] : $normalized;
}
/**
 * Answers the map of lower-cased email address to committer id.
 *
 * The map is loaded at most once per run from the eclipse.org web API
 * (one "email<TAB>id" pair per line) and cached in the global
 * $_emailToCommitterMap. If the source cannot be opened, an empty map is
 * cached and answered, so callers always get an array and the fetch is
 * not retried for every commit.
 *
 * @return array map of email address (lower case) to committer id
 */
function &getEmailToCommitterMap() {
    global $_emailToCommitterMap;

    // Test with is_array() rather than truthiness so that an empty map
    // (failed or empty load) also counts as "already loaded".
    if (is_array($_emailToCommitterMap)) return $_emailToCommitterMap;

    $_emailToCommitterMap = array();
    $file = fopen('https://www.eclipse.org/projects/web-api/email-id-map.php', 'r');
    // A function that returns by reference must return a variable.
    if (!$file) return $_emailToCommitterMap;

    // fgets() answers false at end of input, so no bogus trailing entry
    // is recorded.
    while (($line = fgets($file)) !== false) {
        // explode() replaces split(), which no longer exists in PHP 7+.
        $parts = explode("\t", $line);
        if (count($parts) < 2) continue; // not an "email<TAB>id" line

        $email = strtolower(trim($parts[0]));
        $id = trim($parts[1]);
        // An entry with an empty id would make getCommitterId() answer
        // an empty login, so incomplete pairs are skipped.
        if ($email === '' || $id === '') continue;

        $_emailToCommitterMap[$email] = $id;
    }
    fclose($file);

    return $_emailToCommitterMap;
}
/**
 * Answers the company recorded for a committer id.
 *
 * The id is lower-cased before it is looked up in the
 * committer-to-company map.
 *
 * @param string $id the committer id
 * @return string the company, or 'unknown' when the id is not in the map
 *         or its company entry is empty
 */
function getCommitterCompany($id) {
    $key = strtolower($id);
    $companies = getCommitterToCompanyMap();

    // A missing entry is treated like an empty one.
    $name = isset($companies[$key]) ? $companies[$key] : '';

    return $name ? $name : 'unknown';
}
/**
 * Answers the map of lower-cased committer id to company name.
 *
 * The map is loaded at most once per run from the eclipse.org web API
 * (one "id<TAB>company" pair per line) and cached in the global
 * $_committerToCompanyMap. If the source cannot be opened, an empty map
 * is cached and answered, so callers always get an array and the fetch
 * is not retried for every commit.
 *
 * @return array map of committer id (lower case) to company name
 */
function &getCommitterToCompanyMap() {
    global $_committerToCompanyMap;

    // Test with is_array() rather than truthiness so that an empty map
    // (failed or empty load) also counts as "already loaded".
    if (is_array($_committerToCompanyMap)) return $_committerToCompanyMap;

    $_committerToCompanyMap = array();
    $file = fopen('https://www.eclipse.org/projects/web-api/commit-companies.php', 'r');
    // A function that returns by reference must return a variable.
    if (!$file) return $_committerToCompanyMap;

    // fgets() answers false at end of input, so no bogus trailing entry
    // is recorded.
    while (($line = fgets($file)) !== false) {
        // explode() replaces split(), which no longer exists in PHP 7+.
        $parts = explode("\t", $line);
        if (count($parts) < 2) continue; // not an "id<TAB>company" line

        $id = strtolower(trim($parts[0]));
        if ($id === '') continue; // no committer id to key on

        $_committerToCompanyMap[$id] = trim($parts[1]);
    }
    fclose($file);

    return $_committerToCompanyMap;
}
?>