blob: 393859f0a3845f05e30b431d18535373ff35f588 [file] [log] [blame]
<?php
/*******************************************************************************
* Copyright (c) 2019 Eclipse Foundation and others.
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*******************************************************************************/
/*
* This file is a prototype for executing a license scan of a list of
* content ids. The list can be provided in a few different forms:
*
* - When a "request" field is provided in the POST data, its value is
* parsed as a JSON structure that contains a "dependencies" field that is
* a JSON array of strings, with each string representing one unit of content,
* along with a "project" field that is a string containing a project id;
*
* - When a "content" field is provided in the POST data, its value is
* parsed as plain text with each line containing one unit of content
* (blank lines and comment lines are skipped; comment lines start with
* "#" or "//");
*
* - When a "json" field is provided in the POST data, its value is parsed
* as a JSON array of strings, with each string representing one unit of
* content; or
*
* - When an "id" field is provided in the GET data, it is parsed as a
* single unit of content (i.e., a list of one item)
*
* The id for a unit of content may be expressed as Maven coordinates of the form
* "groupid:artifactid[:packaging]:version", as NPMJS coordinates of the form
* "[namespace/]name@version", or as ClearlyDefined coordinates of the
* form "type/source/namespace/name/version". Mixing formats is supported.
*
* Output is expressed as ClearlyDefined ids.
*
* Two sources of information are used to map ids to license information and other
* metadata: first, the Eclipse Foundation data is consulted and then
* the ClearlyDefined services are called. Future versions of this script
* may consult other sources of data.
*
* usage examples:
*
* curl -X POST http://localhost/projects/services/license_check.php \
* --data-urlencode content@maven.deps
*
* curl -X POST http://localhost/projects/services/license_check.php \
* -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
*
* curl -X POST http://www.eclipse.org/projects/services/license_check.php \
* -d $'json=["npm/npmjs/@theia/variable-resolver/0.3.19","npm/npmjs/@theia/outline-view/0.3.19"]'
*
* curl -X POST http://localhost/projects/services/license_check.php \
* -d $'request={"project":"ecd.theia", "dependencies": ["npm/npmjs/@theia/variable-resolver/0.3.19","npm/npmjs/@theia/outline-view/0.3.19"]}'
*
* curl -X POST "http://localhost/projects/services/license_check.php" \
* -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
*
* yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
* | curl -X POST "http://localhost/projects/services/license_check.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
* --data-urlencode content@- | jsonpp | less
*
* Note that this works with an instance running on localhost.
*
* TODO Validate that we can scale to ~4K lines
* TODO support file upload.
* TODO Customize license list based on project (e.g., science.* can use LGPL)
* TODO Parameters for default type and provider.
*/
require_once dirname(__FILE__) . '/../classes/common.php';
require_once dirname(__FILE__) . '/../classes/Project.class.php';
require_once dirname(__FILE__) . '/../classes/License.class.inc';
require_once dirname(__FILE__) . '/../classes/ProjectContentIdMapper.class.inc';
/**
* Try to massage the content identifier into ClearlyDefined coordinates. That is,
* for example, recognize Maven coordinates of the form <em>groupid:artifactid:version</em>, and
* convert them into the ClearlyDefined equivalent, <em>maven/mavencentral/groupid/artifactid/version</em>.
*
* Answers <code>null</code> when the id is not recognized.
*
* @param string $id
* @return NULL|string
*/
function normalizeId($id) {
    $matches = null;

    // Anything that already contains ClearlyDefined coordinates (five
    // slash-separated segments) is passed through. The pattern is not
    // anchored, so only the matched portion of the input is returned.
    if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $matches)) {
        return $matches[0];
    }

    /*
     * Deal with Maven coordinates ("groupid:artifactid[:packaging]:version").
     * There is a special case that we need to deal with because of Tycho
     * using p2 repositories. When the group id is exactly "p2.eclipse-plugin"
     * or "p2.eclipse-feature", we generate an id for p2/orbit.
     *
     * Only the leading numeric part of the version (digits separated by
     * dots) is kept; any trailing qualifier is dropped.
     */
    if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d+(?:\.\d+)*)/', $id, $matches)) {
        // The dot is escaped so that only a literal "p2.eclipse-..." group id
        // is routed to p2/orbit.
        $isP2 = preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $matches[1]);
        $prefix = $isP2 ? 'p2/orbit' : 'maven/mavencentral';
        return "{$prefix}/{$matches[1]}/{$matches[2]}/{$matches[3]}";
    }

    /*
     * Some p2 cases (org.apache.ant only, I think) provide all of the
     * information that we need to match against Maven directly.
     *
     * e.g. "p2.eclipse-plugin:org.apache.ant:jar:lib/ant-jsch.jar:1.10.5.v20190526-1402"
     * maps to "maven/mavencentral/org.apache.ant/ant-jsch/1.10.5"
     */
    if (preg_match('/p2\.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+)\.jar:(\d+(?:\.\d+)*)/', $id, $matches)) {
        return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
    }

    /*
     * Deal with NPMJS coordinates. These take the form "namespace/name@version",
     * e.g., "@webassemblyjs/wast-printer@1.7.8". The namespace is optional; when
     * absent, we use a dash ("-").
     *
     * Versions reported by yarn (<code>yarn list</code>) may carry range
     * qualifiers (any non-digit characters between "@" and the version).
     * For now, we just ignore them.
     *
     * FIXME Sort out what to do with the range qualifiers
     *
     * FIXME Don't assume that values provided in NPM format are NPM.
     */
    if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $matches)) {
        $namespace = empty($matches[1]) ? '-' : $matches[1];
        return "npm/npmjs/{$namespace}/{$matches[2]}/{$matches[3]}";
    }

    // Not a format that we recognize.
    return null;
}
/**
 * Parse plain text content, one content id per line, into a results array.
 *
 * Blank lines and comment lines (those starting with "#" or "//") are
 * skipped. Every other line is handed to normalizeId(); ids that are
 * recognized are added as keys under "unmatched", and lines that are not
 * recognized are collected, trimmed, under "invalid". Either key is absent
 * when it would be empty.
 *
 * @param string $content
 * @return array
 */
function loadFromString($content) {
    $results = array();

    // Accept Unix, Windows and old Mac line endings.
    foreach (preg_split('/\r\n|\r|\n/', (string) $content) as $line) {
        $line = trim($line);

        // Skip (rather than stop at) blank lines and comment lines.
        if ($line === '') {
            continue;
        }
        if (preg_match('/^(?:#|\/\/)/', $line)) {
            continue;
        }

        if ($id = normalizeId($line)) {
            $results['unmatched'][$id] = array();
        } else {
            $results['invalid'][] = $line;
        }
    }

    return $results;
}
/**
 * Build a results array from the text of a package-lock style JSON document.
 *
 * The content is decoded as JSON and walked with visitPackages(); every id
 * reported by the walk becomes a key under "unmatched". The "unmatched"
 * key is absent when the walk reports nothing.
 *
 * @param string $content
 * @return array
 */
function loadFromPackageLockString($content) {
    $found = array();

    // Collector invoked once per dependency discovered by the walk.
    $collect = function($coordinates) use (&$found) {
        $found['unmatched'][$coordinates] = array();
    };

    $document = json_decode($content, true);
    visitPackages($document, $collect);

    return $found;
}
/**
 * Recursively walk the "dependencies" entries of a decoded JSON structure,
 * invoking the callback with an "npm/npmjs/namespace/name/version" id for
 * each entry.
 *
 * A dependency key of the form "namespace/name" is split on its last slash;
 * a key without a slash gets the namespace "-". Each entry is reported
 * before its own nested dependencies are visited.
 *
 * @param mixed $root decoded JSON; nothing happens when it has no "dependencies"
 * @param callable $callback called with one string argument, the id
 */
function visitPackages($root, $callback) {
    if (!isset($root['dependencies'])) {
        return;
    }

    foreach ($root['dependencies'] as $key => $entry) {
        $parts = null;
        $scoped = preg_match('/^(.+)\/(.+)$/', $key, $parts);

        $namespace = $scoped ? $parts[1] : '-';
        $package = $scoped ? $parts[2] : $key;

        call_user_func($callback, "npm/npmjs/{$namespace}/{$package}/{$entry['version']}");

        // Descend into the nested dependencies of this entry, if any.
        visitPackages($entry, $callback);
    }
}
/**
 * Move ids that can be mapped to an Eclipse project from "unmatched" to
 * "approved".
 *
 * For each unmatched id, ProjectContentIdMapper is asked for an Eclipse
 * project id. When one is found, the project's licenses are rendered as an
 * SPDX expression and the id is recorded as approved with the project id as
 * the authority. Ids that cannot be mapped are left in "unmatched".
 *
 * This function modifies the parameter. It does nothing when there is no
 * "unmatched" entry to work on.
 *
 * @param array $results
 */
function matchAgainstEclipseProjects(&$results) {
    // The "unmatched" key is absent when, for example, every input line
    // was invalid; there is nothing to do in that case.
    if (empty($results['unmatched']) || !is_array($results['unmatched'])) {
        return;
    }

    foreach (array_keys($results['unmatched']) as $id) {
        // Attempt to determine the Eclipse Project id from the
        // ClearlyDefined coordinates. If we can identify the project
        // then we can provide more information back.
        $projectId = ProjectContentIdMapper::getEclipseProjectFor($id);
        if (!$projectId) {
            // A name-based fallback (approving anything whose name starts
            // with "org.eclipse." at a lower confidence) was sketched here
            // previously but is not enabled.
            continue;
        }

        $licenses = License::getLicensesForProject($projectId);
        $spdx = License::getSPDXExpression($licenses);

        unset($results['unmatched'][$id]);
        $results['approved'][$id] = array(
            'id' => $id,
            'license' => $spdx,
            'status' => 'approved',
            'sourceUrl' => '',
            'definitionUrl' => '',
            'authority' => $projectId,
            'confidence' => 100
        );
    }
}
/**
* Private function.
*
* Match against the consolidated data from Eclipse Foundation
* sources. The consolidated data is stored in the dashboard
* database by a script that runs periodically.
*
* This function modifies the parameter. As we find matches, the id
* is removed from the "unmatched" set and the metadata that we do
* find is added to an array by status.
*
* <p>The <code>ThirdPartyLicenseException</code> table is created and maintained by the
* <code>import_third_party_license_data.php</code> "project service" script.
*/
function matchAgainstFoundationData(&$results, $project=null) {
    // The "unmatched" key is absent when, for example, every input line
    // was invalid; there is nothing to do in that case.
    if (empty($results['unmatched']) || !is_array($results['unmatched'])) {
        return;
    }

    foreach (array_keys($results['unmatched']) as $id) {
        // Ids written as "p2.eclipse.plugin" / "p2.eclipse.feature" are
        // looked up using the dashed spelling.
        $real = preg_replace('/p2\.eclipse\.(plugin|feature)/', 'p2.eclipse-$1', $id);

        $match = findBestMatch($real);

        // If we don't find something, or the something that we do find
        // is restricted, then look for an exception.
        if (!$match || $match['status'] == 'restricted') {
            if ($exception = findBoardException($real, $project)) {
                $match = $exception;
            }
        }

        if (!$match) {
            continue;
        }

        // Record the rewrite when the lookup id differs from the id given.
        if (strcmp($id, $real) != 0) {
            $match['actual'] = $real;
            $results['log'][] = "{$id} => {$real}";
        }

        // Report under the id as supplied, filed by the status of the match.
        $match['id'] = $id;
        unset($results['unmatched'][$id]);
        $results[$match['status']][$id] = $match;
    }
}
/**
* Private function.
*
* Attempt to find a match for a particular artifact in the Eclipse Foundation
* intellectual property database.
*
* @param string $id
* @return mixed|NULL the matching row or NULL if there is no match.
*/
function findBestMatch($id) {
    $where = array('id=":id"');
    $order = array('if(status="approved",0,1)');
    $args = array(':id' => $id);

    /*
     * According to the IP Policy, service releases should match either a minor
     * release or another service release at the same minor level.
     *
     * Map the id, expressed as Clearly Defined coordinates, into a pattern
     * capable of matching against service releases.
     *
     * e.g., map "maven/mavencentral/com.github.jnr/jnr-posix/3.0.29" to
     * "maven\/mavencentral\/com\.github\.jnr\/jnr\-posix\/3\.0\.[0-9]+"
     */
    $matches = null;
    if (preg_match('/^((?:[^\/]+\/){4}v?(?:\d+\.\d+))(?:\.(\d+))?/', $id, $matches)) {
        // The prefix comes from caller-supplied input, so it is handed to
        // query() as arguments rather than being interpolated into the SQL.
        // NOTE(review): assumes query() substitutes ':like' and ':regexp'
        // the same way it handles ':pattern' in findBoardException - confirm.
        $args[':like'] = $matches[1] . '.%';
        $args[':regexp'] = preg_quote($matches[1]) . '\.[0-9]+';

        // Captured with \d+, so this is digits only and safe to embed.
        $version = isset($matches[2]) ? $matches[2] : '0';

        // regexp is expensive, so test first with like and then regexp.
        // This improves performance by about five-fold.
        $where[] = "(id like ':like' and id regexp ':regexp')";

        // Order results so that the one with the service release number that's closest
        // to the one we want is at the top of the list.
        $order[] = "abs(substring_index(substring_index(id,'/',-1),'.',-1) - {$version})";
    }

    $whereClause = implode(' OR ', $where);
    $orderList = implode(', ', $order);

    // There may be multiple hits for a particular id. The query only
    // returns one row, preferring rows that represent 'approved'
    // content.
    $sql = "
        select
            id, license, status, sourceUrl, definitionUrl, authority, confidence
        from ThirdPartyLicense
        where $whereClause
        order by $orderList
        limit 1";

    $rows = array();
    query('dashboard', $sql, $args, function($row) use (&$rows, $id) {
        // Report the id that was asked for, not the id of the matched row.
        $row['id'] = $id;
        $rows[] = $row;
    });

    // Answer NULL (as documented) rather than false when nothing matched.
    return $rows ? $rows[0] : null;
}
/**
* Private function.
*
* Attempt to find an exception granted by the Eclipse Foundation Board of
* Directors that permits the project to use a particular bit of content.
*
* <p>There are several ways that
* an exception can be granted. The different ways are represented as the "rule"
* in the <code>ThirdPartyLicenseException</code> table. The rules are handled
* as follows:
*
* <ul>
* <li><em>project</em> - Allow a single specific project to use a specific
* version of a specific item;</li>
* <li><em>project_all</em> - Allow a single specific project to use all versions
* of a specific item;</li>
* <li><em>project_all_future</em> - Allow a single specific project to use a specific
* version, and all future versions, of a specific item;</li>
* <li><em>all</em> - Allow all projects to use all versions of a specific item;</li>
* </ul>
*
* <p>Additional rules may be added in the future.
*
* <p>The <code>ThirdPartyLicenseException</code> table is created and maintained by the
* <code>import_third_party_license_data.php</code> "project service" script.
*
* @param string $id a ClearlyDefined Id.
* @param string $project a project id in the standard format (e.g., "technology.dash").
* @return mixed|NULL the matching row or NULL if there is no match.
*/
function findBoardException($id, $project) {
    // The lookup is only attempted when a project is given.
    if (!$project) {
        return null;
    }

    // Take everything up to and including the fourth slash, i.e. the id
    // without its version, as the prefix for matching "any version".
    $matches = null;
    if (!preg_match('/^((?:[^\/]+\/){4})/', $id, $matches)) {
        return null;
    }
    $pattern = $matches[1] . '%';

    // TODO Confirm that this orders such that the best candidate is at the top.
    // For now, we put an exact match at the top when one exists and do a
    // secondary sort that should put the most recent version at the top.
    $sql = "
        select
            l.id, l.license,
            'approved' as status,
            l.sourceUrl, l.definitionUrl,
            l.authority, l.confidence,
            e.project, e.rule
        from ThirdPartyLicense as l, ThirdPartyLicenseException as e
        where
            (e.rule = 'project' and e.project=':project' and e.id=':id' and e.id=l.id)
            or (e.rule in ('project_all','project_all_future','workswith') and e.project=':project' and e.id like ':pattern' and e.id=l.id)
            or (e.rule='all' and e.id like ':pattern' and e.id=l.id)
        order by if(e.id=':id',0,1) asc, id desc
        limit 1";
    $args = array(':id' => $id, ':project' => $project, ':pattern' => $pattern);

    $rows = array();
    query('dashboard', $sql, $args, function($row) use (&$rows, $id) {
        // Report the id that was asked for, not the id of the matched row.
        $row['id'] = $id;
        $rows[] = $row;
    });

    // Answer NULL (as documented) rather than false when nothing matched.
    return $rows ? $rows[0] : null;
}
// Everything above this line could (and probably should) be factored out.
// The response body is produced with json_encode() below, so declare it as JSON.
header('Content-Type: application/json');
header('Content-Disposition: inline; filename="licenses.json"');

require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
$App = new App ();
require_once dirname(__FILE__) . "/../classes/database.inc";

// Answer the named request field when it is a non-empty string (values that
// PHP treats as false, such as "" and "0", are ignored), or null otherwise.
$field = function (array $source, $name) {
    if (isset($source[$name]) && is_string($source[$name]) && $source[$name]) {
        return $source[$name];
    }
    return null;
};

// Turn a decoded JSON list of ids into the "unmatched" map (id => array()).
// Anything that is not a list of non-empty strings contributes nothing.
$toUnmatched = function ($ids) {
    $unmatched = array();
    if (is_array($ids)) {
        foreach ($ids as $id) {
            if (is_string($id) && $id !== '') {
                $unmatched[$id] = array();
            }
        }
    }
    return $unmatched;
};

$projectId = $field($_POST, 'project');
if ($projectId === null) {
    $projectId = $field($_GET, 'project');
}

$results = null;

// Support "id=..." arguments; $argv is not always defined for web requests.
if (isset($argv) && is_array($argv)) {
    foreach ($argv as $arg) {
        $matches = null;
        if (preg_match('/^id=(?<id>.*)$/', $arg, $matches)) {
            $results = array('unmatched' => array($matches['id'] => array()));
        }
    }
}

// Only the first input form that is present is used.
if ($json = $field($_POST, 'request')) {
    $results = array();
    $request = json_decode($json, true);
    if (!is_array($request)) {
        $request = array();
        $results['errors'][] = array("errorMalformedRequest" => "The request is not a valid JSON structure");
    }
    if (isset($request['project']) && is_string($request['project']) && $request['project']) {
        $projectId = $request['project'];
    }
    $results['unmatched'] = $toUnmatched(isset($request['dependencies']) ? $request['dependencies'] : null);
} elseif ($content = $field($_GET, 'id')) {
    $results = array('unmatched' => array($content => array()));
} elseif ($content = $field($_POST, 'content')) {
    $results = loadFromString($content);
} elseif ($content = $field($_POST, 'package-lock')) {
    $results = loadFromPackageLockString($content);
} elseif ($content = $field($_POST, 'json')) {
    $ids = json_decode($content, true);
    $results = array('unmatched' => $toUnmatched($ids));
    if (!is_array($ids)) {
        $results['errors'][] = array("errorMalformedRequest" => "The json field is not a valid JSON array");
    }
}

// Only a well-formed project id is forwarded to the exception lookup.
$exceptionProject = null;
if ($projectId) {
    if (isValidProjectId($projectId)) {
        $exceptionProject = $projectId;
        if ($project = Project::getProject($projectId)) {
            $results['project'] = array(
                "id" => $project->getId(),
                "name" => $project->getFormalName(),
                "license" => License::getSPDXExpression(License::getLicensesForProject($projectId))
            );
        } else {
            $results['errors'][] = array("errorInvalidProjectId" => "The project id does not match an existing project");
        }
    } else {
        $results['errors'][] = array("errorMalformedProjectId" => "The project id is not well-formed");
    }
}

// The matchers work on the "unmatched" set; skip them when there is none
// (e.g. when only errors or invalid lines have been recorded).
if (!empty($results['unmatched'])) {
    matchAgainstEclipseProjects($results);
    matchAgainstFoundationData($results, $exceptionProject);
}
if (!$results) {
    $results = array();
}

echo json_encode($results);
?>