#!/usr/bin/perl

use CGI::Util;
use Encode qw(_utf8_on); # Perl unicode hacking.
use HTML::Entities qw(encode_entities_numeric);
use LWP::Simple;

# Manual mappings from three-digit class numbers to Library of Congress
# Subject Heading identifiers (the part of the id.loc.gov authority URI that
# precedes "#concept").
# I fill these in when I have the time.
my $lcsh = {
   '110' => 'sh85084286',
   '111' => 'sh85094833',
   '113' => 'sh85033169',
   '114' => 'sh2006003964',
   '115' => 'sh85135395',
   '116' => 'sh85022510',
   '118' => 'sh85050452',
   '470' => 'sh85068863',
   '471' => 'sh85074944',
   '472' => 'sh85074948',
   '473' => 'sh87008049',
   '475' => 'sh85074950',
   '477' => 'sh85074964',
   '641' => 'sh85050184',
   '642' => 'sh85038076',
   '643' => 'sh85062635',
   '645' => 'sh85062549',
   '646' => 'sh85120542',
   };

# Number of manual mappings above; interpolated into the dataset description
# written to the RDF/XML output.  Declared with "my" so that it is a
# file-scoped lexical instead of an accidental package global.
my $lcshCount = scalar keys %{$lcsh};

use DateTime;
# Date of this run as YYYY-MM-DD; written out as the dataset's dc:modified
# value in the RDF/XML header and shown in the HTML footer.
my $now = DateTime->now()->ymd('-');

# Open both output files for writing, replacing whatever a previous run left.
# Three-argument open keeps the mode separate from the file name, and a failed
# open stops the script here instead of letting every later print fail
# silently.  The bareword handles XML and HTML are the ones the print
# statements in the rest of the script write to.
open(XML, '>', 'decimalised.rdf')
   or die "Cannot open decimalised.rdf for writing: $!\n";
open(HTML, '>', 'decimalised.html')
   or die "Cannot open decimalised.html for writing: $!\n";

# Write the fixed head of the RDF/XML document: the ConceptScheme / VoID
# dataset description and the "#number" datatype description.  The heredoc
# interpolates, so $now and $lcshCount are filled in, and a literal backslash
# has to be written as "\\" -- the two uriRegexPattern values containing an
# escaped dot rely on that (a lone "\." would be emitted as a bare ".").
print XML <<EOF;
<rdf:RDF xml:lang="en" xml:base="http://purl.org/NET/decimalised"
    xmlns="http://www.w3.org/2004/02/skos/core#"
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:dc="http://purl.org/dc/terms/"
    xmlns:foaf="http://xmlns.com/foaf/0.1/"
    xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
    xmlns:void="http://rdfs.org/ns/void#">
    
   <ConceptScheme rdf:about="#d" rdf:type="http://rdfs.org/ns/void#Dataset">
      <dc:title>Decimalised Database of Concepts</dc:title>
      <foaf:homepage rdf:resource="http://purl.org/NET/decimalised" />
      <dc:abstract>The decimalised database of concepts is a collection of topics suitable for use in linked data. </dc:abstract>
      <dc:description>The decimalised database of concepts is a collection of topics suitable for use in linked data. It is inspired by the Dewey Decimal Classification, but no guarantees are made about the closeness of its resemblance as a whole. SKOS mapping links are provided from this database to the Dewey system, to Library of Congress Classification codes and to DBPedia resources where possible.</dc:description>
      <hasTopConcept rdf:resource="#000" />     
      <hasTopConcept rdf:resource="#100" />
      <hasTopConcept rdf:resource="#200" />
      <hasTopConcept rdf:resource="#300" />
      <hasTopConcept rdf:resource="#400" />
      <hasTopConcept rdf:resource="#500" />
      <hasTopConcept rdf:resource="#600" />
      <hasTopConcept rdf:resource="#700" />
      <hasTopConcept rdf:resource="#800" />
      <hasTopConcept rdf:resource="#900" />
      <dc:subject rdf:resource="http://dbpedia.org/resource/Categorization" />
      <dc:subject rdf:resource="#c025.431" />
      <dc:creator>
         <foaf:Person rdf:about="http://tobyinkster.co.uk/#i">
            <foaf:name>Toby Inkster</foaf:name>
            <foaf:homepage rdf:resource="http://tobyinkster.co.uk/" />
         </foaf:Person>
      </dc:creator>
      <dc:source rdf:resource="http://en.wikipedia.org/wiki/List_of_Dewey_Decimal_classes" />
      <dc:source rdf:resource="http://en.wikipedia.org/wiki/Comparison_of_Dewey_and_Library_of_Congress_subject_classification" />
      <dc:relation rdf:resource="http://purl.org/dc/terms/DDC" />
      <dc:created>2009-03-24</dc:created>
      <dc:modified>$now</dc:modified>
      <dc:license rdf:resource="http://creativecommons.org/licenses/by-sa/3.0/" />     
      <void:exampleResource rdf:resource="#c005" />
      <void:exampleResource rdf:resource="#c025.431" />
      <void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}(\\.[0-9]+)?</void:uriRegexPattern>
      <void:subset>
         <void:Dataset rdf:about="#d-integral">
            <dc:description>The subset of &lt;#d&gt; with integer numbers.</dc:description>
            <dc:isPartOf rdf:resource="#d" />
            <void:exampleResource rdf:resource="#c005" />
            <void:exampleResource rdf:resource="#c999" />
            <void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}</void:uriRegexPattern>
         </void:Dataset>
      </void:subset>
      <void:subset>
         <void:Dataset rdf:about="#d-fractional">
            <dc:isPartOf rdf:resource="#d" />
            <dc:description>The subset of &lt;#d&gt; with non-integer numbers.</dc:description>
            <rdfs:comment>Although not currently listed, terms with decimal points (such as &lt;#c025.431&gt;) are considered to be defined by this document.</rdfs:comment>
            <void:exampleResource rdf:resource="#c025.431" />
            <void:exampleResource rdf:resource="#c523.4" />
            <void:uriRegexPattern>http://purl.org/NET/decimalised#c[0-9]{3}\\.[0-9]+</void:uriRegexPattern>
         </void:Dataset>
      </void:subset>
      <void:subset>
         <void:Linkset rdf:about="#d-linksto-dewey">
            <dc:isPartOf rdf:resource="#d" />
            <rdfs:comment>All integral Decimalised Database of Concepts terms include links to related Dewey Decimal Classification terms.</rdfs:comment>
            <void:subjectsTarget rdf:resource="#d" />
            <void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#exactMatch" />
            <void:objectsTarget>
               <void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/dewey-decimal">
                  <foaf:homepage rdf:resource="http://www.oclc.org/dewey/" />
                  <void:uriRegexPattern>info:ddc/22/eng//.+</void:uriRegexPattern>
               </void:Dataset>
            </void:objectsTarget>
         </void:Linkset>
      </void:subset>
      <void:subset>
         <void:Linkset rdf:about="#d-linksto-lcc">
            <dc:isPartOf rdf:resource="#d" />
            <rdfs:comment>Many integral Decimalised Database of Concepts terms include links to related Library of Congress Classification terms.</rdfs:comment>
            <void:subjectsTarget rdf:resource="#d" />
            <void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#closeMatch" />
            <void:objectsTarget>
               <void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/library-of-congress-classifications">
                  <foaf:homepage rdf:resource="http://inkdroid.org/lcco/" />
                  <void:uriRegexPattern>http://inkdroid.org/lcco/.+</void:uriRegexPattern>
               </void:Dataset>
            </void:objectsTarget>
         </void:Linkset>
      </void:subset>
      <void:subset>
         <void:Linkset rdf:about="#d-linksto-lcsh">
            <dc:isPartOf rdf:resource="#d" />
            <rdfs:comment>Some integral Decimalised Database of Concepts terms include links to related Library of Congress Subject Headings. These are added manually. There are currently $lcshCount such links.</rdfs:comment>
            <void:subjectsTarget rdf:resource="#d" />
            <void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#closeMatch" />
            <void:objectsTarget>
               <void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/library-of-congress-subject-headings">
                  <foaf:homepage rdf:resource="http://id.loc.gov/" />
                  <void:uriRegexPattern>http://id.loc.gov/authorities/.+#concept</void:uriRegexPattern>
               </void:Dataset>
            </void:objectsTarget>
         </void:Linkset>
      </void:subset>
      <void:subset>
         <void:Linkset rdf:about="#d-linksto-dbpedia">
            <dc:isPartOf rdf:resource="#d" />
            <rdfs:comment>Many integral Decimalised Database of Concepts terms include links to related DBPedia resources.</rdfs:comment>
            <void:subjectsTarget rdf:resource="#d" />
            <void:linkPredicate rdf:resource="http://www.w3.org/2004/02/skos/core#relatedMatch" />
            <void:objectsTarget>
               <void:Dataset rdf:about="tag:ontologi.es,2009-05-01:decimalised/related-datasets/dbpedia">
                  <foaf:homepage rdf:resource="http://dbpedia.org/" />
                  <void:uriRegexPattern>http://dbpedia.org/resource/.+</void:uriRegexPattern>
               </void:Dataset>
            </void:objectsTarget>
         </void:Linkset>
      </void:subset>
      <void:feature>
         <void:TechnicalFeature rdf:about="#d-feature-rdfxml">
            <dc:format>application/rdf+xml</dc:format>
         </void:TechnicalFeature>
      </void:feature>
      <void:feature>
         <void:TechnicalFeature rdf:about="#d-feature-ntriples">
            <dc:format>text/plain</dc:format>
         </void:TechnicalFeature>
      </void:feature>
      <void:feature>
         <void:TechnicalFeature rdf:about="#d-feature-rdfa">
            <dc:format>text/html</dc:format>
            <rdfs:comment>Contains only a subset of the information in the other versions.</rdfs:comment>
         </void:TechnicalFeature>
      </void:feature>
      <void:dataDump rdf:resource="http://purl.org/NET/decimalised" />
      <void:vocabulary rdf:resource="http://www.w3.org/2004/02/skos/core#" />
   </ConceptScheme>

   <rdf:Description rdf:about="#number">
      <rdfs:label>Number</rdfs:label>
      <rdfs:comment>A number below 1000 to identify a resource within a scheme.</rdfs:comment>
   </rdf:Description>

EOF

# Start the XHTML+RDFa page: document head, page heading, the pointer to the
# richer RDF/XML and N-Triples versions, and the opening of the concept list.
# Nothing in this text is interpolated, so a literal (single-quoted) heredoc
# is used.
print HTML <<'HTML_HEAD';
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
    "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xml:lang="en" version="XHTML+RDFa 1.0"
    xmlns="http://www.w3.org/1999/xhtml"
    xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
    xmlns:s="http://www.w3.org/2004/02/skos/core#"
    xmlns:d="http://purl.org/dc/terms/">
<!--
  The prefixes above may be unusual, but 'skos' and 'dcterms'
  would add an extra 15000 bytes to this file!
-->
<head profile="http://www.w3.org/1999/xhtml/vocab">
   <title property="d:title">Decimalised Database of Concepts</title>
   <base href="http://purl.org/NET/decimalised" />
</head>
<body about="#d" typeof="s:ConceptScheme">
   <h1 property="d:title">Decimalised Database of Concepts</h1>
   <p about="decimalised.html" property="rdfs:comment" datatype=""><strong>This page is marked up with XHTML+RDFa 1.0, but the <a rel="meta" href="decimalised.rdf">RDF/XML</a> and <a rel="meta" href="decimalised.nt">N-Triples</a> versions include much more information.</strong></p>
   <h2>Concepts</h2>
   <ul rev="s:inScheme">
HTML_HEAD

# Fetch the raw wikitext of Wikipedia's Dewey / Library of Congress comparison
# table and index its rows by three-digit Dewey number:
#   $lccLinks->{$num}  array ref of the wiki page names linked from the row
#   $lccEquiv->{$num}  text of the row's second column
#   $lccTitle->{$num}  text of the row's third column with wiki markup removed
my $lccPage = get('http://en.wikipedia.org/w/index.php?title=Comparison_of_Dewey_and_Library_of_Congress_subject_classification&action=raw');

# get() returns undef when the request fails; stop rather than silently
# generating output without any of this data.
die "Could not fetch the Dewey/Library of Congress comparison page from Wikipedia\n"
   unless defined $lccPage;

# Marks the text as UTF-8 characters without validating it.
# NOTE(review): presumably the page is served as UTF-8 bytes -- confirm
# against the installed LWP::Simple before replacing this with a real decode.
_utf8_on($lccPage);

my @lccLines = split /\r?\n/, $lccPage;
my $lccLinks = {};
my $lccEquiv = {};
my $lccTitle = {};
foreach my $line (@lccLines)
{
   # Only table rows of the form "|NNN||...||..." are of interest.
   next unless $line =~ /^\|(\d\d\d)\|\|([^\|]+)\|\|(.+)$/;
   my ($num, $equiv, $title) = ($1, $2, $3);

   my @links;

   # Drop bold/italic quote markup.
   $title =~ s/'{2,3}//g;

   # Replace each [[target|label]] with its label (or the target when there
   # is no label), remembering the target in @links.
   $title =~ s/\[\[(.+?)\]\]/my ($uri, $label) = split m#\|#, $1; $label ||= $uri; push @links, $uri;$label/eg;

   # Trim surrounding whitespace.
   $title =~ s/(^\s+)|(\s+$)//g;

   $lccLinks->{$num} = \@links;
   $lccEquiv->{$num} = $equiv;
   $lccTitle->{$num} = $title;
}

# Fetch the raw wikitext of Wikipedia's outline of Dewey Decimal classes and
# turn every assigned three-digit class into two fragments, both stored at the
# array index given by the class number:
#   $defs[$num]   the RDF/XML <Concept> element
#   $defsH[$num]  the XHTML+RDFa <li> element
my @defs;
my @defsH;
my $ddcPage = get('http://en.wikipedia.org/w/index.php?title=Outline_of_Dewey_Decimal_classes&action=raw');

# get() returns undef when the request fails; without this page there is
# nothing to generate, so stop with a clear message.
die "Could not fetch the Dewey Decimal outline page from Wikipedia\n"
   unless defined $ddcPage;

# Marks the text as UTF-8 characters without validating it.
# NOTE(review): presumably the page is served as UTF-8 bytes -- confirm
# against the installed LWP::Simple before replacing this with a real decode.
_utf8_on($ddcPage);

my @ddcLines = split /\r?\n/, $ddcPage;
foreach my $line (@ddcLines)
{
   # Remove some MediaWiki formatting.
   $line =~ s/'{2,3}//g;

   # Remove links, but retain them: each [[target|label]] becomes its label
   # (or the target when there is no label) and the target goes into @links.
   my @links;
   $line =~ s/\[\[(.+?)\]\]/my ($uri, $label) = split m#\|#, $1; $label ||= $uri; push @links, $uri;$label/eg;

   # Some codings are unassigned.
   next if $line =~ /(un|not )assigned/i;

   # Only interested in list items that start with a three-digit number.
   next unless $line =~ /^\*\*?\s*(\d{3})\s+(.+)$/;
   my ($num, $topic) = ($1, $2);

   $topic =~ s/(^\s+)|(\s+$)//g;

   # HTML list item, with an extra LCSH link when a manual mapping exists.
   my $optionalLink = '';
   $optionalLink = " <a href=\"http://id.loc.gov/authorities/".encode_entities_numeric($lcsh->{$num})."#concept\" rel=\"s:closeMatch\">(lcsh)</a>"
      if defined $lcsh->{$num};
   $defsH[$num] = "\t\t<li about=\"#c${num}\" typeof=\"s:Concept\"><b property=\"d:identifier s:notation\" rel=\"s:exactMatch\" resource=\"info:ddc/22/eng//${num}\">${num}</b> <span property=\"s:prefLabel\">".encode_entities_numeric($topic)."</span>$optionalLink</li>\n";

   # RDF/XML concept element.
   my $d = "\t<Concept rdf:about=\"#c${num}\" dc:identifier=\"${num}\">\n";
   $d .= "\t\t<notation rdf:datatype=\"#number\">${num}</notation>\n";
   $d .= "\t\t<inScheme rdf:resource=\"#d\" />\n";

   # The comparison-table title becomes an altLabel only when it differs
   # (ignoring case) from the outline's own label.
   $d .= "\t\t<prefLabel>".encode_entities_numeric($topic)."</prefLabel>\n";
   $d .= "\t\t<altLabel>".encode_entities_numeric($lccTitle->{$num})."</altLabel>\n"
      if (defined $lccTitle->{$num}) && ((lc $lccTitle->{$num}) ne (lc $topic));

   # NNx (x != 0) is narrower than NN0; Nx0 (x != 0) is narrower than N00;
   # N00 itself gets no broader concept.
   if ($num =~ /^(\d)(\d)([1-9])$/)
      { $d .= "\t\t<broader rdf:resource=\"#c${1}${2}0\" />\n"; }
   elsif ($num =~ /^(\d)([1-9])(0)$/)
      { $d .= "\t\t<broader rdf:resource=\"#c${1}00\" />\n"; }

   $d .= "\t\t<exactMatch rdf:resource=\"http://dewey.info/class/${num}/\" />\n";
   $d .= "\t\t<exactMatch rdf:resource=\"info:ddc/22/eng//${num}\" />\n";
   $d .= "\t\t<closeMatch rdf:resource=\"http://inkdroid.org/lcco/".encode_entities_numeric($lccEquiv->{$num})."\" />\n"
      if defined $lccEquiv->{$num};

   $d .= "\t\t<closeMatch rdf:resource=\"http://id.loc.gov/authorities/".encode_entities_numeric($lcsh->{$num})."#concept\" />\n"
      if defined $lcsh->{$num};

   # Add the links collected from the comparison table, if this class has a
   # row there; the "|| []" avoids dereferencing undef when it does not.
   push @links, @{ $lccLinks->{$num} || [] };

   # One relatedMatch (DBPedia) and one foaf:page (Wikipedia) per distinct
   # page name, compared case-insensitively after escaping.
   my %seen;
   my $wikiPages = '';
   foreach my $page (@links)
   {
      $page =~ s/ /_/g;
      $page = CGI::Util::escape(ucfirst($page));

      next if $seen{lc $page};

      $wikiPages .= "\t\t<foaf:page rdf:resource=\"http://en.wikipedia.org/wiki/".encode_entities_numeric($page)."\" />\n";
      $d .= "\t\t<relatedMatch rdf:resource=\"http://dbpedia.org/resource/".encode_entities_numeric($page)."\" />\n";

      $seen{lc $page} = 1;
   }
   $d .= $wikiPages; # This just forces the Wikipedia links to the end of the XML block.

   $d .= "\t</Concept>\n\n";

   $defs[$num] = $d;
}

# RDF/XML: every collected concept element, in array order, followed by the
# closing root tag.
print XML $_ for @defs;
print XML "</rdf:RDF>\n";

# XHTML: every collected list item, in array order, followed by the end of the
# list, the attribution footer (creator, modification date, licence) and the
# closing tags.
print HTML $_ for @defsH;
print HTML <<HTML_FOOT;
   </ul>
   <address about="#d">
      <span rel="d:creator"><a xmlns:foaf="http://xmlns.com/foaf/0.1/" typeof="foaf:Person" about="http://tobyinkster.co.uk/#i" rel="foaf:homepage" property="foaf:name" href="http://tobyinkster.co.uk/">Toby Inkster</a></span>
      | <span property="d:modified">$now</span>
      | <a rel="d:license" href="http://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA</a>
   </address>
</body>
</html>
HTML_FOOT

# Buffered write errors (for example a full disk) only surface when the handle
# is closed, so check both closes before anything reads the files back.
close(XML)  or die "Error closing decimalised.rdf: $!\n";
close(HTML) or die "Error closing decimalised.html: $!\n";

# Convert the RDF/XML to sorted N-Triples with the external "rapper" tool.
# The status tested is the one the shell reports for the whole pipeline, so a
# missing shell command or a failing final stage is reported here.
system('rapper decimalised.rdf | sort > decimalised.nt') == 0
   or die "Generating decimalised.nt with rapper failed (wait status $?)\n";