#!/usr/bin/perl
use CGI::Util;
use Encode qw(_utf8_on); # Perl unicode hacking.
use HTML::Entities qw(encode_entities_numeric);
use LWP::Simple;
# Hand-maintained mapping from three-digit class numbers to Library of
# Congress Subject Heading identifiers.  Entries are added manually, so
# coverage is partial; classes without an entry simply get no LCSH link.
my $lcsh = {
    '110' => 'sh85084286',
    '111' => 'sh85094833',
    '113' => 'sh85033169',
    '114' => 'sh2006003964',
    '115' => 'sh85135395',
    '116' => 'sh85022510',
    '118' => 'sh85050452',
    '470' => 'sh85068863',
    '471' => 'sh85074944',
    '472' => 'sh85074948',
    '473' => 'sh87008049',
    '475' => 'sh85074950',
    '477' => 'sh85074964',
    '641' => 'sh85050184',
    '642' => 'sh85038076',
    '643' => 'sh85062635',
    '645' => 'sh85062549',
    '646' => 'sh85120542',
};

# Number of manual LCSH links; interpolated into the RDF/XML header text.
# Declared lexically (it used to be an implicit package global) so the
# script does not rely on undeclared variables.
my $lcshCount = scalar( keys %$lcsh );
use DateTime;

# Date stamp (YYYY-MM-DD) written as the modification date in both outputs.
my $now = DateTime->now->ymd;

# Output files.  The bareword handles XML and HTML are kept because the
# rest of the script prints to them by name.  Three-argument open keeps the
# mode separate from the file name, and failures are fatal rather than
# letting every later print go silently nowhere.
open(XML, '>', 'decimalised.rdf')
    or die "Cannot open decimalised.rdf for writing: $!\n";
open(HTML, '>', 'decimalised.html')
    or die "Cannot open decimalised.html for writing: $!\n";
# RDF/XML preamble: the SKOS concept scheme / voiD dataset description,
# its subsets and linksets, and the "#number" datatype used for notations.
#
# This heredoc interpolates ($now, $lcshCount), which has three consequences
# for the literal text below:
#   * a literal backslash must be written as "\\" -- a lone "\." would be
#     emitted as ".", corrupting the void:uriRegexPattern values;
#   * literal angle brackets in element text are written as &lt; / &gt;
#     so the document stays well-formed XML;
#   * top concepts are referenced as "#cNNN", the same form used for the
#     rdf:about of every emitted Concept.
print XML <<EOF;
<rdf:RDF xml:lang="en" xml:base="http://purl.org/NET/decimalised"
xmlns="http://www.w3.org/2004/02/skos/core#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/terms/"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:void="http://rdfs.org/ns/void#">
<ConceptScheme rdf:about="#d" rdf:type="http://rdfs.org/ns/void#Dataset">
<dc:title>Decimalised Database of Concepts</dc:title>
<foaf:homepage rdf:resource="http://purl.org/NET/decimalised" />
<dc:abstract>The decimalised database of concepts is a collection of topics suitable for use in linked data. </dc:abstract>
<dc:description>The decimalised database of concepts is a collection of topics suitable for use in linked data. It is inspired by the Dewey Decimal Classification, but no guarantees are made about the closeness of its resemblance as a whole. SKOS mapping links are provided from this database to the Dewey system, to Library of Congress Classification codes and to DBPedia resources where possible.</dc:description>
<hasTopConcept rdf:resource="#c000" />
<hasTopConcept rdf:resource="#c100" />
<hasTopConcept rdf:resource="#c200" />
<hasTopConcept rdf:resource="#c300" />
<hasTopConcept rdf:resource="#c400" />
<hasTopConcept rdf:resource="#c500" />
<hasTopConcept rdf:resource="#c600" />
<hasTopConcept rdf:resource="#c700" />
<hasTopConcept rdf:resource="#c800" />
<hasTopConcept rdf:resource="#c900" />
<dc:subject rdf:resource="http://dbpedia.org/resource/Categorization" />
<dc:subject rdf:resource="#c025.431" />
<dc:creator>
<foaf:Person rdf:about="http://tobyinkster.co.uk/#i">
<foaf:name>Toby Inkster</foaf:name>
<foaf:homepage rdf:resource="http://tobyinkster.co.uk/" />
</foaf:Person>
</dc:creator>
<dc:source rdf:resource="http://en.wikipedia.org/wiki/List_of_Dewey_Decimal_classes" />
<dc:source rdf:resource="http://en.wikipedia.org/wiki/Comparison_of_Dewey_and_Library_of_Congress_subject_classification" />
<dc:relation rdf:resource="http://purl.org/dc/terms/DDC" />
<dc:created>2009-03-24</dc:created>
<dc:modified>$now</dc:modified>
<dc:license rdf:resource="http://creativecommons.org/licenses/by-sa/3.0/" />
<void:exampleResource rdf:resource="#c005" />
<void:exampleResource rdf:resource="#c025.431" />
<void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}(\\.[0-9]+)?</void:uriRegexPattern>
<void:subset>
<void:Dataset rdf:about="#d-integral">
<dc:description>The subset of &lt;#d&gt; with integer numbers.</dc:description>
<dc:isPartOf rdf:resource="#d" />
<void:exampleResource rdf:resource="#c005" />
<void:exampleResource rdf:resource="#c999" />
<void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}</void:uriRegexPattern>
</void:Dataset>
</void:subset>
<void:subset>
<void:Dataset rdf:about="#d-fractional">
<dc:isPartOf rdf:resource="#d" />
<dc:description>The subset of &lt;#d&gt; with non-integer numbers.</dc:description>
<rdfs:comment>Although not currently listed, terms with decimal points (such as &lt;#c025.431&gt;) are considered to be defined by this document.</rdfs:comment>
<void:exampleResource rdf:resource="#c025.431" />
<void:exampleResource rdf:resource="#c523.4" />
<void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}\\.[0-9]+</void:uriRegexPattern>
</void:Dataset>
</void:subset>
<void:subset>
<void:Linkset rdf:about="#d-linksto-dewey">
<dc:isPartOf rdf:resource="#d" />
<rdfs:comment>All integral Decimalised Database of Concepts terms include links to related Dewey Decimal Classification terms.</rdfs:comment>
<void:subjectsTarget rdf:resource="#d" />
<void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#exactMatch" />
<void:objectsTarget>
<void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/dewey-decimal">
<foaf:homepage rdf:resource="http://www.oclc.org/dewey/" />
<void:uriRegexPattern>info:ddc/22/eng//.+</void:uriRegexPattern>
</void:Dataset>
</void:objectsTarget>
</void:Linkset>
</void:subset>
<void:subset>
<void:Linkset rdf:about="#d-linksto-lcc">
<dc:isPartOf rdf:resource="#d" />
<rdfs:comment>Many integral Decimalised Database of Concepts terms include links to related Library of Congress Classification terms.</rdfs:comment>
<void:subjectsTarget rdf:resource="#d" />
<void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#closeMatch" />
<void:objectsTarget>
<void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/library-of-congress-classifications">
<foaf:homepage rdf:resource="http://inkdroid.org/lcco/" />
<void:uriRegexPattern>http://inkdroid.org/lcco/.+</void:uriRegexPattern>
</void:Dataset>
</void:objectsTarget>
</void:Linkset>
</void:subset>
<void:subset>
<void:Linkset rdf:about="#d-linksto-lcsh">
<dc:isPartOf rdf:resource="#d" />
<rdfs:comment>Some integral Decimalised Database of Concepts terms include links to related Library of Congress Subject Headings. These are added manually. There are currently $lcshCount such links.</rdfs:comment>
<void:subjectsTarget rdf:resource="#d" />
<void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#closeMatch" />
<void:objectsTarget>
<void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/library-of-congress-subject-headings">
<foaf:homepage rdf:resource="http://id.loc.gov/" />
<void:uriRegexPattern>http://id.loc.gov/authorities/.+#concept</void:uriRegexPattern>
</void:Dataset>
</void:objectsTarget>
</void:Linkset>
</void:subset>
<void:subset>
<void:Linkset rdf:about="#d-linksto-dbpedia">
<dc:isPartOf rdf:resource="#d" />
<rdfs:comment>Many integral Decimalised Database of Concepts terms include links to related DBPedia resources.</rdfs:comment>
<void:subjectsTarget rdf:resource="#d" />
<void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#relatedMatch" />
<void:objectsTarget>
<void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/dbpedia">
<foaf:homepage rdf:resource="http://dbpedia.org/" />
<void:uriRegexPattern>http://dbpedia.org/resource/.+</void:uriRegexPattern>
</void:Dataset>
</void:objectsTarget>
</void:Linkset>
</void:subset>
<void:feature>
<void:TechnicalFeature rdf:about="#d-feature-rdfxml">
<dc:format>application/rdf+xml</dc:format>
</void:TechnicalFeature>
</void:feature>
<void:feature>
<void:TechnicalFeature rdf:about="#d-feature-ntriples">
<dc:format>text/plain</dc:format>
</void:TechnicalFeature>
</void:feature>
<void:feature>
<void:TechnicalFeature rdf:about="#d-feature-rdfa">
<dc:format>text/html</dc:format>
<rdfs:comment>Contains only a subset of the information in the other versions.</rdfs:comment>
</void:TechnicalFeature>
</void:feature>
<void:dataDump rdf:resource="http://purl.org/NET/decimalised" />
<void:vocabulary rdf:resource="http://www.w3.org/2004/02/skos/core#" />
</ConceptScheme>
<rdf:Description rdf:about="#number">
<rdfs:label>Number</rdfs:label>
<rdfs:comment>A number below 1000 to identify a resource within a scheme.</rdfs:comment>
</rdf:Description>
EOF
# Opening part of the XHTML+RDFa page: doctype, namespace prefixes, page
# heading, and the start of the concept list.  The list items themselves
# are appended later, and the list is closed by the page footer.
# Nothing in this text needs interpolation, so it is quoted literally.
print HTML <<'XHTML_HEAD';
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xml:lang="en" version="XHTML+RDFa 1.0"
xmlns="http://www.w3.org/1999/xhtml"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:s="http://www.w3.org/2004/02/skos/core#"
xmlns:d="http://purl.org/dc/terms/">
<!--
The prefixes above may be unusual, but 'skos' and 'dcterms'
would add an extra 15000 bytes to this file!
-->
<head profile="http://www.w3.org/1999/xhtml/vocab">
<title property="d:title">Decimalised Database of Concepts</title>
<base href="http://purl.org/NET/decimalised" />
</head>
<body about="#d" typeof="s:ConceptScheme">
<h1 property="d:title">Decimalised Database of Concepts</h1>
<p about="decimalised.html" property="rdfs:comment" datatype=""><strong>This page is marked up with XHTML+RDFa 1.0, but the <a rel="meta" href="decimalised.rdf">RDF/XML</a> and <a rel="meta" href="decimalised.nt">N-Triples</a> versions include much more information.</strong></p>
<h2>Concepts</h2>
<ul rev="s:inScheme">
XHTML_HEAD
# Fetch the raw wikitext of Wikipedia's Dewey / Library of Congress
# comparison table.  LWP::Simple's get() returns undef on failure, so stop
# here rather than silently generating output with no LCC data.
my $lccPage = get('http://en.wikipedia.org/w/index.php?title=Comparison_of_Dewey_and_Library_of_Congress_subject_classification&action=raw');
die "Could not fetch the Dewey/LCC comparison page from Wikipedia\n"
    unless defined $lccPage;

# Flag the fetched string as UTF-8 in place (no validation is performed).
_utf8_on($lccPage);
my @lccLines = split /\r?\n/, $lccPage;

# Lookup tables keyed by three-digit class number:
#   $lccLinks - array ref of wiki link targets found in the row's title
#   $lccEquiv - the row's second column (the LCC equivalent), verbatim
#   $lccTitle - the row's title with wiki markup stripped
my $lccLinks = {};
my $lccEquiv = {};
my $lccTitle = {};

foreach my $row (@lccLines)
{
    # Table rows look like: |NNN||<equivalent>||<title>
    next unless $row =~ /^\|(\d\d\d)\|\|([^\|]+)\|\|(.+)$/;
    my ($num, $equiv, $t) = ($1, $2, $3);

    my @links;

    # Drop bold/italic quote markup.
    $t =~ s/'{2,3}//g;

    # Replace each [[target|label]] (or [[target]]) with its label, and
    # remember the target so it can be emitted as a link later.
    $t =~ s{\[\[(.+?)\]\]}{
        my ($uri, $label) = split /\|/, $1;
        $label ||= $uri;
        push @links, $uri;
        $label;
    }eg;

    # Trim surrounding whitespace.
    $t =~ s/(^\s+)|(\s+$)//g;

    $lccLinks->{$num} = \@links;
    $lccEquiv->{$num} = $equiv;
    $lccTitle->{$num} = $t;
}
# Per-concept output fragments, indexed by the numeric value of the
# three-digit class number (so the arrays are sparse and a later entry
# for the same number replaces an earlier one):
#   @defs  - RDF/XML <Concept> blocks
#   @defsH - XHTML+RDFa <li> items
my @defs;
my @defsH;

# Fetch the raw wikitext of the Dewey outline.  get() returns undef on
# failure; without this page there is nothing to generate, so stop.
my $ddcPage = get('http://en.wikipedia.org/w/index.php?title=Outline_of_Dewey_Decimal_classes&action=raw');
die "Could not fetch the Dewey Decimal outline from Wikipedia\n"
    unless defined $ddcPage;

# Flag the fetched string as UTF-8 in place (no validation is performed).
_utf8_on($ddcPage);
my @ddcLines = split /\r?\n/, $ddcPage;

foreach my $line (@ddcLines)
{
    # Remove some MediaWiki formatting.
    $line =~ s/'{2,3}//g;

    # Replace each [[target|label]] with its label, but keep the targets.
    my @links;
    $line =~ s{\[\[(.+?)\]\]}{
        my ($uri, $label) = split /\|/, $1;
        $label ||= $uri;
        push @links, $uri;
        $label;
    }eg;

    # Some codings are unassigned.
    next if $line =~ /(un|not )assigned/i;

    # Only interested in one- or two-star bullets of the form "NNN Topic".
    next unless $line =~ /^\*\*?\s*(\d{3})\s+(.+)$/;
    my ($num, $topic) = ($1, $2);
    $topic =~ s/(^\s+)|(\s+$)//g;

    my $topicXml = encode_entities_numeric($topic);
    my $lcshId   = $lcsh->{$num};

    # XHTML list item, with an optional LCSH link when a manual mapping exists.
    my $optionalLink = '';
    if (defined $lcshId)
    {
        $optionalLink = " <a href=\"http://id.loc.gov/authorities/".encode_entities_numeric($lcshId)."#concept\" rel=\"s:closeMatch\">(lcsh)</a>";
    }
    $defsH[$num] = "\t\t<li about=\"#c${num}\" typeof=\"s:Concept\"><b property=\"d:identifier s:notation\" rel=\"s:exactMatch\" resource=\"info:ddc/22/eng//${num}\">${num}</b> <span property=\"s:prefLabel\">".$topicXml."</span>$optionalLink</li>\n";

    # RDF/XML concept block.
    my $d = "\t<Concept rdf:about=\"#c${num}\" dc:identifier=\"${num}\">\n";
    $d .= "\t\t<notation rdf:datatype=\"#number\">${num}</notation>\n";
    $d .= "\t\t<inScheme rdf:resource=\"#d\" />\n";
    $d .= "\t\t<prefLabel>".$topicXml."</prefLabel>\n";

    # The LCC comparison title becomes an altLabel when it differs
    # (case-insensitively) from the Dewey outline's topic.
    my $altTitle = $lccTitle->{$num};
    if (defined $altTitle && lc($altTitle) ne lc($topic))
    {
        $d .= "\t\t<altLabel>".encode_entities_numeric($altTitle)."</altLabel>\n";
    }

    # Hierarchy: NNd (d != 0) is narrower than NN0; Nd0 (d != 0) is
    # narrower than N00; N00 has no broader concept.
    if ($num =~ /^(\d)(\d)([1-9])$/)
        { $d .= "\t\t<broader rdf:resource=\"#c${1}${2}0\" />\n"; }
    elsif ($num =~ /^(\d)([1-9])(0)$/)
        { $d .= "\t\t<broader rdf:resource=\"#c${1}00\" />\n"; }

    $d .= "\t\t<exactMatch rdf:resource=\"http://dewey.info/class/${num}/\" />\n";
    $d .= "\t\t<exactMatch rdf:resource=\"info:ddc/22/eng//${num}\" />\n";
    if (defined $lccEquiv->{$num})
    {
        $d .= "\t\t<closeMatch rdf:resource=\"http://inkdroid.org/lcco/".encode_entities_numeric($lccEquiv->{$num})."\" />\n";
    }
    if (defined $lcshId)
    {
        $d .= "\t\t<closeMatch rdf:resource=\"http://id.loc.gov/authorities/".encode_entities_numeric($lcshId)."#concept\" />\n";
    }

    # Add the links collected from the LCC comparison row.  Classes with
    # no such row have no entry, so fall back to an empty list instead of
    # dereferencing undef.
    push @links, @{ $lccLinks->{$num} || [] };

    # Emit one DBPedia relatedMatch and one Wikipedia foaf:page per
    # distinct link target (compared case-insensitively after escaping).
    my %already;
    my $d2 = '';
    foreach my $page (@links)
    {
        $page =~ s/ /_/g;
        $page = CGI::Util::escape(ucfirst($page));
        next if $already{lc $page}++;
        $d2 .= "\t\t<foaf:page rdf:resource=\"http://en.wikipedia.org/wiki/".encode_entities_numeric($page)."\" />\n";
        $d  .= "\t\t<relatedMatch rdf:resource=\"http://dbpedia.org/resource/".encode_entities_numeric($page)."\" />\n";
    }

    $d .= $d2; # This just forces the Wikipedia links to the end of the XML block.
    $d .= "\t</Concept>\n\n";
    $defs[$num] = $d;
}
# Write the accumulated fragments in index order: concept blocks to the
# RDF/XML file, list items to the XHTML file.  Both arrays are filled by
# class number, so unused slots are undef and contribute no output.
print XML  $_ for @defs;
print HTML $_ for @defsH;

# Close the RDF/XML document element.
print XML "</rdf:RDF>\n";
# Closing part of the XHTML page: ends the concept list and adds the
# creator, modification date ($now is interpolated) and licence.
print HTML <<"XHTML_FOOT";
</ul>
<address about="#d">
<span rel="d:creator"><a xmlns:foaf="http://xmlns.com/foaf/0.1/" typeof="foaf:Person" about="http://tobyinkster.co.uk/#i" rel="foaf:homepage" property="foaf:name" href="http://tobyinkster.co.uk/">Toby Inkster</a></span>
| <span property="d:modified">$now</span>
| <a rel="d:license" href="http://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA</a>
</address>
</body>
</html>
XHTML_FOOT
# Buffered write errors (e.g. a full disk) only surface at close, so check
# both handles before handing the RDF file to the converter.
close XML  or die "Error closing decimalised.rdf: $!\n";
close HTML or die "Error closing decimalised.html: $!\n";

# Derive the N-Triples version from the RDF/XML via rapper, sorted.  This
# must go through the shell because of the pipe and redirection; the
# command line is a fixed string with no interpolated input.
system('rapper decimalised.rdf | sort > decimalised.nt') == 0
    or die "Generating decimalised.nt failed (wait status $?)\n";