blob: 3ef4f7702d2ad5cb7989a8fe0cab96aeeb01d3b9 [file] [log] [blame]
<?php
/*******************************************************************************
* Copyright (c) 2019 Eclipse Foundation and others.
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*******************************************************************************/
/*
* This file is a prototype for executing a license scan of a bill of
* materials. The bill of materials is provided as plain text with each
* line containing one unit of content (i.e., library). Blank lines
* and comment lines are skipped (comment lines start with "#" or "//").
*
* The id for a unit of content may be expressed as Maven coordinates of the form
* "groupid:artifactid[:packaging]:version", as abridged Purl coordinates of the form
* "type/namespace/name@version", or as ClearlyDefined coordinates of the
* form "type/source/namespace/name/version". Mixing formats is supported.
*
* Output is expressed as ClearlyDefined ids.
*
* Two sources of information are used to map ids to license information and other
* metadata: first, the Eclipse Foundation data is consulted and then
* the ClearlyDefined services are called. Future versions of this script
* may consult other sources of data.
*
* usage example:
*
* curl -X POST http://localhost/projects/services/license_check.php \
* --data-urlencode content@maven.deps
*
* curl -X POST http://localhost/projects/services/license_check.php \
* -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
*
* curl -X POST "http://localhost/projects/services/license_check.php" \
* -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
*
* yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
* | curl -X POST "http://localhost/projects/services/license_check.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
* --data-urlencode content@- | jsonpp | less
*
* Note that this works with an instance running on localhost.
*
* TODO Validate that we can scale to ~4K lines
* TODO support file upload.
* TODO Customize license list based on project (e.g., science.* can use LGPL)
* TODO Parameters for default type and provider.
*/
/**
* Try to massage the content identifier into ClearlyDefined coordinates. That is,
* for example, recognize Maven coordinates of the form <em>groupid:artifactid:version</em>, and
* convert them into the ClearlyDefined equivalent, <em>maven/mavencentral/groupid/artifactid/version</em>.
*
* Answers <code>null</code> when the id is not recognized.
*
* @param string $id
* @return NULL|string
*/
function normalizeId($id) {
    $matches = null;

    // Anything that already contains ClearlyDefined coordinates (five
    // slash-separated segments) is passed through as-is.
    if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $matches)) {
        return $matches[0];
    }

    /*
     * Maven coordinates. Maven prints dependencies in the form
     * "groupid:artifactid[:packaging[:classifier]]:version[:scope]", so up to
     * two segments may sit between the artifact id and the version. Allowing
     * only one made the (unanchored) pattern slide one segment to the right
     * for classified artifacts and report the artifact id as the group id.
     *
     * There is a special case because of Tycho using p2 repositories: when the
     * coordinates start with "p2.eclipse-plugin" or "p2.eclipse-feature", we
     * generate an id for p2/orbit instead of Maven Central.
     */
    if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+){0,2}:(\d+(?:\.\d+)*)/', $id, $matches)) {
        // The dot is escaped so that only the literal "p2." prefix qualifies.
        if (preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $matches[1])) {
            return "p2/orbit/{$matches[1]}/{$matches[2]}/{$matches[3]}";
        }
        return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
    }

    /*
     * Some p2 cases (org.apache.ant only, I think) provide all of the
     * information that we need to match against Maven directly.
     *
     * e.g. "p2.eclipse-plugin:org.apache.ant:jar:lib/ant-jsch.jar:1.10.5.v20190526-1402"
     * maps to "maven/mavencentral/org.apache.ant/ant-jsch/1.10.5"
     *
     * The embedded "/" keeps these ids from matching the Maven pattern above,
     * which is why this check can safely come after it.
     */
    if (preg_match('/p2\.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+)\.jar:(\d+(?:\.\d+)*)/', $id, $matches)) {
        return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
    }

    /*
     * pURL coordinates of the form "namespace/name@version",
     * e.g., "@webassemblyjs/wast-printer@1.7.8". The namespace is optional;
     * when absent, we use a dash ("-").
     *
     * Versions provided by yarn ("yarn list") may carry range qualifiers
     * such as "^" or "~"; any non-digit characters in front of the version
     * are ignored.
     *
     * FIXME Sort out what to do with the range qualifiers
     *
     * FIXME Don't assume that values provided in pURL format are NPM.
     */
    if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $matches)) {
        $namespace = empty($matches[1]) ? '-' : $matches[1];
        return "npm/npmjs/{$namespace}/{$matches[2]}/{$matches[3]}";
    }

    // Not a format that we recognize.
    return null;
}
/**
 * Parse a plain-text bill of materials, one content id per line.
 *
 * Blank lines and comment lines (those starting with "#" or "//") are
 * skipped; they do not end the scan. Every other line is handed to
 * normalizeId(): recognized ids become keys under 'unmatched' (each mapped to
 * an empty array), and lines that cannot be normalized are collected, trimmed,
 * under 'invalid'.
 *
 * @param string $content the bill of materials
 * @return array possibly empty; may contain the keys 'unmatched' and 'invalid'
 */
function loadFromString($content) {
    $results = array();

    // Split on any newline convention rather than streaming through
    // php://memory; there is no resource to clean up this way.
    foreach (preg_split('/\R/', (string) $content) as $raw) {
        $line = trim($raw);

        // Compare against '' explicitly: empty() would also discard "0".
        if ($line === '') continue;
        // Comment lines start with "#" or "//".
        if (preg_match('/^(?:#|\/\/)/', $line)) continue;

        $id = normalizeId($line);
        if ($id !== null) {
            $results['unmatched'][$id] = array();
        } else {
            $results['invalid'][] = $line;
        }
    }

    return $results;
}
/**
 * Build the initial result set from JSON text shaped like a package-lock
 * file: the text is decoded and every package reported by visitPackages() is
 * recorded as a key under 'unmatched', mapped to an empty array.
 *
 * @param string $content JSON text
 * @return array empty when no packages are reported
 */
function loadFromPackageLockString($content) {
    $found = array();
    $collect = function($id) use (&$found) {
        $found['unmatched'][$id] = array();
    };
    visitPackages(json_decode($content, true), $collect);
    return $found;
}
/**
 * Walk a tree of 'dependencies' entries depth-first, invoking the callback
 * with an "npm/npmjs/namespace/name/version" id for each entry before
 * descending into that entry's own 'dependencies'.
 *
 * A key of the form "namespace/name" is split at its last slash; a key
 * without a slash gets the namespace "-".
 *
 * @param array $root node that may carry a 'dependencies' map
 * @param callable $callback receives each generated id
 */
function visitPackages($root, Callable $callback) {
    if (!isset($root['dependencies'])) {
        return;
    }

    foreach ($root['dependencies'] as $key => $entry) {
        $pieces = null;
        $scoped = preg_match('/^(.+)\/(.+)$/', $key, $pieces);
        $namespace = $scoped ? $pieces[1] : '-';
        $name = $scoped ? $pieces[2] : $key;

        call_user_func($callback, "npm/npmjs/{$namespace}/{$name}/{$entry['version']}");
        visitPackages($entry, $callback);
    }
}
/**
 * Approve content that comes from an Eclipse project.
 *
 * Each id under 'unmatched' is split on "/". The id is moved to 'approved'
 * when its third segment satisfies matchesEclipseNamespace(), or when that
 * segment is "p2.eclipse-plugin"/"p2.eclipse-feature" and the fourth segment
 * starts with org.eclipse, org.polarsys or org.locationtech.
 *
 * This function modifies the parameter.
 *
 * @param array $results result set; left untouched when it has no 'unmatched' entries
 */
function matchAgainstEclipseProjects(&$results) {
    // A request whose lines were all invalid has no 'unmatched' key at all.
    if (empty($results['unmatched']) || !is_array($results['unmatched'])) return;

    foreach (array_keys($results['unmatched']) as $id) {
        // Ids are not guaranteed to have five segments, so default the
        // segments we inspect instead of reading undefined offsets.
        $parts = explode('/', $id);
        $namespace = isset($parts[2]) ? $parts[2] : '';
        $name = isset($parts[3]) ? $parts[3] : '';

        $isEclipseContent = matchesEclipseNamespace($namespace)
            || (preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $namespace)
                && preg_match('/^org\.(?:eclipse|polarsys|locationtech)/', $name));
        if (!$isEclipseContent) continue;

        unset($results['unmatched'][$id]);
        $results['approved'][$id] = array(
            'id' => $id,
            'license' => '',
            'status' => 'approved',
            'sourceUrl' => '',
            'definitionUrl' => '',
            'authority' => 'eclipse',
            'confidence' => 90
        );
    }
}
/**
 * Answers whether the namespace matches one of the hard-coded patterns
 * listed below.
 *
 * @param string $namespace
 * @return bool
 */
function matchesEclipseNamespace($namespace) {
    // TODO Make this data-driven
    $patterns = array(
        '/^eclipse\b/',
        '/^org\.eclipse\b/',
        '/^org\.polarsys\b/',
        '/^org\.locationtech\b/',
        '/^jakarta\b/',
        '/^org\.aspectj/',
        '/^@theia$/',
    );

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $namespace)) {
            return true;
        }
    }

    return false;
}
/*
* Match against the consolidated data from Eclipse Foundation
* sources. The consolidated data is stored in the dashboard
* database by a script that runs periodically.
*
* This function modifies the parameter. As we find matches, the id
* is removed from the "unmatched" set and the metadata that we do
* find is added to an array by status.
*
* @see project-services/capture/php/import_third_party_license_data.php
*/
function matchAgainstFoundationData(&$results) {
    // A request whose lines were all invalid has no 'unmatched' key at all;
    // there is nothing to look up in that case.
    if (empty($results['unmatched']) || !is_array($results['unmatched'])) return;

    foreach (array_keys($results['unmatched']) as $id) {
        // The exact id is passed through the ":id" argument of query().
        $where = array('id=":id"');
        // Rows with status "approved" sort ahead of everything else.
        $order = array('if(status="approved",0,1)');

        /*
         * According to the IP Policy, service releases should match either a
         * minor release or another service release at the same minor level.
         *
         * So, in addition to the exact id, accept any id that shares the same
         * "type/source/namespace/name/major.minor" prefix followed by a
         * numeric service segment.
         *
         * e.g., "maven/mavencentral/com.github.jnr/jnr-posix/3.0.29" also
         * matches "maven/mavencentral/com.github.jnr/jnr-posix/3.0.<n>".
         *
         * The prefix is interpolated into the SQL text (it is not passed as a
         * query argument), and ids originate from the request. The segments
         * are therefore restricted to the characters that ClearlyDefined
         * coordinates are built from; an id containing anything else (quotes,
         * for instance) only gets the exact-match lookup.
         */
        $matches = null;
        if (preg_match('/^((?:[\w@\-.]+\/){4}(?:\d+\.\d+))(?:\.(\d+))?/', $id, $matches)) {
            $like = $matches[1] . '.%';
            // NOTE(review): the backslashes added by preg_quote() end up
            // inside a SQL string literal; confirm how the database treats them.
            $regexp = preg_quote($matches[1]) . '\.[0-9]+';
            // Captured with \d+ only, so safe to embed as a number.
            $version = isset($matches[2]) ? $matches[2] : '0';

            // regexp is expensive, so test first with like and then regexp.
            // This improves performance by about five-fold.
            $where[] = "(id like '{$like}' and id regexp '{$regexp}')";

            // Order results so that the one with the service release number
            // that's closest to the one we want is at the top of the list.
            $order[] = "abs(substring_index(substring_index(id,'/',-1),'.',-1) - {$version})";
        }

        $whereClause = implode(' OR ', $where);
        $orderList = implode(', ', $order);

        // There may be multiple hits for a particular id. The query only
        // returns one row, preferring rows that represent 'approved'
        // content.
        $sql = "
            select
                id, license, status, sourceUrl, definitionUrl, authority, confidence
            from ThirdPartyLicense
            where $whereClause
            order by $orderList
            limit 1";
        $args = array(':id' => $id);

        query('dashboard', $sql, $args, function($row) use (&$results, $id) {
            // Report the id that was asked for, which may differ from the
            // service release that actually matched.
            $row['id'] = $id;
            unset($results['unmatched'][$id]);
            $results[$row['status']][$id] = $row;
        });
    }
}
// Everything above this line could (and probably should) be factored out.
// The response body is produced by json_encode() below, so describe it as
// JSON. The disposition value itself must not be wrapped in quotes; only the
// filename parameter is.
header ("Content-Type: application/json");
header ("Content-Disposition: inline; filename=\"licenses.json\"");

require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
$App = new App ();
require_once dirname(__FILE__) . "/../classes/database.inc";

// Start from an empty result so that a request without any recognized field
// still yields a well-formed (empty) response.
$results = array();

// Exactly one input form is honoured, in this order of precedence:
// 'content' (plain text), 'package-lock' (JSON text), 'json' (list of ids).
if (!empty($_POST['content'])) {
    $results = loadFromString($_POST['content']);
} elseif (!empty($_POST['package-lock'])) {
    $results = loadFromPackageLockString($_POST['package-lock']);
} elseif (!empty($_POST['json'])) {
    $ids = json_decode($_POST['json'], true);
    // Malformed JSON decodes to null and a bare scalar is not a list; only
    // strings are usable as ids.
    if (is_array($ids)) {
        $ids = array_filter($ids, 'is_string');
        $results = array('unmatched' => array_fill_keys($ids, array()));
    }
}

if ($results) {
    matchAgainstEclipseProjects($results);
    matchAgainstFoundationData($results);
}

echo json_encode($results);
?>