Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Section
Column
width50%

Table of contents

Table of Contents
maxLevel3
Column
width50%
Wiki Markup

h1. {anchor:Document information} Document information

| *Title:* KE Usage Statistics Guidelines \\
*Subject:* Usage Statistics,  Guidelines, Repositories, Publications, Research Intelligence \\
*Moderator:* Peter Verhaar ([KE Usage Statistics Work Group|standards:KE Usage Statistics Guidelines Work group]) \\
*Version:* 0.2 \\
*Date published:* 2010-04-13 \\
*Excerpt*:{excerpt} Guidelines for the exchange of usage statistics from a repository to a central server using OAI-PMH and OpenURL Context Objects. {excerpt}\\
\\
(Optional information) \\
*Type:* Guidelines, Technical Documentation \\
*Format:* html/text \\
*Identifier:* [http://purl.org/REP/standards/KE Usage Statistics Guidelines|http://purl.org/REP/standards/KE+Usage+Statistics+Guidelines]\\
*Language:* EN \\
*Rights:* CC-BY \\
*Tags:* {page-info:labels}Macro om labels te geven niet mogelijk in huidige installatie, wordt nog  geupdate |

Info

This page is maintained by: KE Usage Statistics Work Group

...

Code Block
xml
xml
titleListing 2
linenumberstrue
collapsetrue
<?xml version="1.0" encoding="UTF-8"?>
<!-- Listing 2: SUSHI ReportRequest in a SOAP envelope. It names the requestor and
     the customer, and asks for "Daily Report v1" for the usage date range given
     under Filters. -->
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <ReportRequest xmlns="http://www.niso.org/schemas/sushi"
                   xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                   xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd">
      <Requestor>
        <ID>www.logaggregator.nl</ID>
        <Name>Log Aggregator</Name>
        <Email>logaggregator@surf.nl</Email>
      </Requestor>
      <CustomerReference>
        <ID>www.leiden.edu</ID>
        <Name>Leiden University</Name>
      </CustomerReference>
      <ReportDefinition Release="urn:robots-v1.xml" Name="Daily Report v1">
        <Filters>
          <UsageDateRange>
            <Begin>2009-12-21</Begin>
            <End>2009-12-22</End>
          </UsageDateRange>
        </Filters>
      </ReportDefinition>
    </ReportRequest>
  </soap:Body>
</soap:Envelope>

...

Code Block
xml
xml
titleListing 3
linenumberstrue
collapsetrue
<?xml version="1.0" encoding="UTF-8"?>
<!-- Listing 3: SUSHI ReportResponse in a SOAP envelope. It repeats the requestor,
     customer and report definition, and carries the usage events as ContextObjects
     inside Report. The "..." lines stand for content left out of this example. -->
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <ReportResponse xmlns="http://www.niso.org/schemas/sushi"
                    xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
                    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                    xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd">
      <Requestor>
        <ID>www.logaggregator.nl</ID>
        <Name>Log Aggregator</Name>
        <Email>logaggregator@surf.nl</Email>
      </Requestor>
      <CustomerReference>
        <ID>www.leiden.edu</ID>
        <Name>Leiden University</Name>
      </CustomerReference>
      <ReportDefinition Release="urn:DRv1" Name="Daily Report v1">
        <Filters>
          <UsageDateRange>
            <Begin>2009-12-22</Begin>
            <End>2009-12-23</End>
          </UsageDateRange>
        </Filters>
      </ReportDefinition>
      <Report>
        <ctx:context-objects xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                             xmlns:dcterms="http://dublincore.org/documents/2008/01/14/dcmi-terms/"
                             xmlns:ctx="info:ofi/fmt:xml:xsd:ctx">
          <ctx:context-object timestamp="2009-11-09T05:56:18+01:00">
            ...
            ...
          </ctx:context-object>
        </ctx:context-objects>
      </Report>
    </ReportResponse>
  </soap:Body>
</soap:Envelope>

...

Code Block
xml
xml
titleListing 4
linenumberstrue
collapsetrue
 <?xml version="1.0" encoding="UTF-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ [http://schemas.xmlsoap.org/soap/envelope/]">
<soap:Body>
<ReportResponse xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter [http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd]"
xmlns="http://www.niso.org/schemas/sushi"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" >
<Requestor>
<ID>www.logaggregator.nl</ID>
<Name>Log Aggregator</Name>
<Email>logaggregator@surf.nl</Email>
</Requestor>
<CustomerReference>
<ID>www.leiden.edu</ID>
<Name>Leiden University</Name>
</CustomerReference>
<ReportDefinition Release="urn:DRv1" Name="Daily Report v1">
<Filters>
<UsageDateRange>
<Begin>2009-12-22</Begin>
<End>2009-12-23</End>
</UsageDateRange>
</Filters>
</ReportDefinition>
<Exception>
<Number>1</Number>
<Message>The range of dates that was provided is not valid. Only daily reports are
available.</Message>
</Exception> </ReportResponse>
</soap:Body>
</soap:Envelope>

...

Code Block
xml
xml
titleListing 5
linenumberstrue
collapsetrue
<?xml version="1.0" encoding="UTF-8"?>
<!-- Listing 5: SUSHI ReportResponse in a SOAP envelope that carries an Exception
     (number 2) instead of a report: the robot description file is not accessible. -->
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <ReportResponse xmlns="http://www.niso.org/schemas/sushi"
                    xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
                    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                    xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd">
      <Requestor>
        <ID>www.logaggregator.nl</ID>
        <Name>Log Aggregator</Name>
        <Email>logaggregator@surf.nl</Email>
      </Requestor>
      <CustomerReference>
        <ID>www.leiden.edu</ID>
        <Name>Leiden University</Name>
      </CustomerReference>
      <ReportDefinition Release="urn:DRv1" Name="Daily Report v1">
        <Filters>
          <UsageDateRange>
            <Begin>2009-12-22</Begin>
            <End>2009-12-23</End>
          </UsageDateRange>
        </Filters>
      </ReportDefinition>
      <Exception>
        <Number>2</Number>
        <Message>The file describing the internet robots is not accessible.</Message>
      </Exception>
    </ReportResponse>
  </soap:Body>
</soap:Envelope>

...

Code Block
xml
xml
titleListing 6
linenumberstrue
collapsetrue
<?xml version="1.0" encoding="UTF-8"?>
<!-- Listing 6: SUSHI ReportResponse in a SOAP envelope that carries an Exception
     (number 3) instead of a report: the report is not yet available, and the
     estimated time of completion is given in the Data element. -->
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <ReportResponse xmlns="http://www.niso.org/schemas/sushi"
                    xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
                    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                    xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd">
      <Requestor>
        <ID>www.logaggregator.nl</ID>
        <Name>Log Aggregator</Name>
        <Email>logaggregator@surf.nl</Email>
      </Requestor>
      <CustomerReference>
        <ID>www.leiden.edu</ID>
        <Name>Leiden University</Name>
      </CustomerReference>
      <ReportDefinition Release="urn:DRv1" Name="Daily Report v1">
        <Filters>
          <UsageDateRange>
            <Begin>2009-12-22</Begin>
            <End>2009-12-23</End>
          </UsageDateRange>
        </Filters>
      </ReportDefinition>
      <Exception>
        <Number>3</Number>
        <Message>The report is not yet available. The estimated time of completion is provided under "Data".</Message>
        <Data>2010-01-08T12:13:00+01:00</Data>
      </Exception>
    </ReportResponse>
  </soap:Body>
</soap:Envelope>

...

<referent>

 

SURE

Two identifiers for the requested document must be given in separate <identifier> elements, namely the OAI-PMH identifier and the URL of the document.

OA-S

The <identifier> element contains an identifier of that resource and is repeated for every known identifier of the resource. This should also include identifiers of more abstract resources which the accessed resource is a part of, e. g. a journal identifier. In order to facilitate compatibility with the German OA Network project, the OAI-PMH identifier of the resource's metadata (as issued by the repository's metadata OAI-PMH data provider) shall also be included. All identifiers must be given in URI format.

NEEO

Two <ctx:identifier> descriptors must be present: (1) the identifier of the object file downloaded (must correspond to the value of the "ref" attribute in the NEEO-DIDL descriptor for the objectFile) and (2) the OAI identifier of the publication (of which the specific object file is downloaded).
More <ctx:identifier> descriptors can be present. A good candidate is the identifier used in the root Item of the NEEO-DIDL representation of the publication3.

<referring-entity>

 

SURE

The URL that was received from the referrer and the classification of the search engine, if it was used, must be given in separate <identifier> elements.

OA-S

If available, an HTTP referrer has to be included in the ContextObject's <referring-entity> element. This indicates the entity which was directing to the used resource at the time of the usage event (if it was not forged). As a minimal requirement, this would be the URL provided by the HTTP referrer string. Additionally, all other known identifiers for that resource may also be specified.

NEEO

URL received from referrer

<requester>

 

SURE

The <requester>, the agent who has requested the <referent>, must be identified by providing the C-class subnet and the encrypted IP address, which must both be given in separate <identifier> elements. In addition, the name of the country where the request was initiated must be provided. The <metadata-by-val> element must be used for this purpose. The country must be given in <dcterms:spatial>. The dcterms namespace must be declared in the <format> element as well.

OA-S

The <requester> element holds information about the agent that generated the usage event, basically identifying the user that triggered the event. This includes the IP address (in a hashed form for privacy reasons), Class-C network address (also hashed), host name (reduced to only first and second level domain name, also for privacy reasons), a classification of the agent, a session ID and User Agent string.

NEEO

If this entity is used in a NEEO-SWUP ContextObject, then at least one <ctx:identifier> must be present, holding the MD5-encryption of the IP address of the browser from which the usage event occurs.

<service-type>

 

SURE

The DC metadata term "type" is used to clarify whether the usage event involved a download of an object file or a metadata view. The value must be either 'objectFile' or 'metadataView'.

OA-S

The <service-type> element classifies the used resource. This is based on metadata in the format specified by the "info:ofi/fmt:xml:xsd:sch_svc" scheme. This catalogue of classifications may be extended at a later stage of the OA Statistics project. The method of expressing this classification is prescribed by the ContextObject XML schema.

NEEO

The EO Gateway will consider all incoming NEEO-SWUP ContextObjects as representations for download events. This entity, if present, is ignored by the EO Gateway.

<referrer>

 

SURE

This element is not used

OA-S

This element is not used

NEEO

If this entity is used in a NEEO-SWUP ContextObject, then at least one <ctx:identifier> must be present, holding a string that corresponds to the User-Agent header, which is transmitted in the HTTP transaction that generates the download.

<resolver>

 

SURE

An <identifier> for the institution that provided access to the downloaded document must be given within <resolver>.

OA-S

This additional information may be optionally specified and is only sensible for link resolver usage data (as opposed to web servers or repositories). The <resolver> element specifies the URL of the OpenURL resolver itself. The <referrer> element specifies the identifier of the context from within which the user triggered the usage of the target resource, which is given via the <referent> element and which was itself referenced by the <referring-entity> element (see above).

NEEO

Exactly 1 <ctx:identifier> descriptor must be present, holding the OAI baseURL for the repository that generates this ContextObject. This OAI baseURL must correspond exactly to the one given in the Admin file of the corresponding NEEO partner institution.

Appendix B: Robot filter list

The robot filter list is placed in a yet to be determined web location where services can easily find and use this list.

An internet robot is defined according to the definitions on this page.

The table below represents the robot list, with descriptions, as determined today (2010-05-06). This list can be changed according to the knowledge of the Working Group.

PloS

COUNTER

Description

<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="96d6ff76-2b58-4664-8612-216e54198d7f"><ac:plain-text-body><![CDATA[

[^a]fish


 

]]></ac:plain-text-body></ac:structured-macro>

<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="55d4cc22-497e-4648-9dad-0ee945fcedc2"><ac:plain-text-body><![CDATA[

[+:,\.\;\/
-]bot


 

]]></ac:plain-text-body></ac:structured-macro>

acme\.spider


 

alexa


 

Alexandria(\s|\+)prototype(\s|\+)project

Alexandria prototype project

 

AllenTrack


 

almaden


 

appie


 

Arachmo

Arachmo

 

archive\.org_bot


 

arks


 

asterias


 

atomz


 

autoemailspider


 

awbot


 

baiduspider


 

bbot


 

biadu


 

biglotron


 

bloglines


 

blogpulse


 

boitho\.com-dc


 

bookmark-manager


 

<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="d92c89a0-d3ad-4c72-9193-335fb12a3316"><ac:plain-text-body><![CDATA[

bot[+:,\.\;\/
-]


 

]]></ac:plain-text-body></ac:structured-macro>

Brutus\/AET

Brutus/AET

 

bspider


 

bwh3_user_agent


 

cfnetwork| checkbot


 

China\sLocal\sBrowse\s2\.6


 


Code Sample Web Client

 

combine


 

commons-httpclient


 

ContentSmartz


 

core


 

crawl


 

cursor


 

custo


 

DataCha0s\/2\.0


 

Demo\sBot


 

docomo


 

DSurf


 

dtSearchSpider

dtSearchSpider

 

dumbot


 

easydl


 

EmailSiphon


 

EmailWolf


 

exabot


 

fast-webcrawler


 

favorg


 

FDM(\s|\+)1

FDM 1

 

feedburner


 

feedfetcher-google


 

Fetch(\s|\+)API(\s|\+)Request

Fetch API Request

 

findlinks


 

gaisbot


 

GetRight

GetRight

 

geturl


 

gigabot


 

girafabot


 

gnodspider


 

Goldfire(\s|\+)Server

Goldfire Server

 

Googlebot

Googlebot

 

grub


 

heritrix


 

hl_ftien_spider


 

holmes


 

htdig


 

htmlparser


 

httpget-5\.2\.2

httpget-5.2.2

 

httrack


 

HTTrack

HTTrack

 

ia_archiver


 

ichiro


 

iktomi


 

ilse


 

internetseer


 

iSiloX

iSiloX

 

java


 

jeeves


 

jobo


 

larbin


 

libwww-perl

libwww-perl

 

linkbot


 

linkchecker


 

linkscan


 

linkwalker


 

livejournal\.com


 

lmspider


 

LOCKSS


 

LWP\:\:Simple

LWP::Simple

 

lwp-request


 

lwp-tivial


 

lwp-trivial

lwp-trivial

 

lycos


 

mediapartners-google


 

megite


 

Microsoft(\s|\+)URL(\s|\+)Control

Microsoft URL Control

 

milbot

Milbot

 

mj12bot


 

mnogosearch


 

mojeekbot


 

momspider


 

motor


 

msiecrawler


 

msnbot


 


MSNBot

 

MuscatFerre


 

myweb


 

NABOT


 

nagios


 

NaverBot

NaverBot

 

netcraft


 

netluchs


 

ng\/2\.


 

no_user_agent


 

nutch


 

ocelli


 

Offline(\s|\+)Navigator

Offline Navigator

 

OurBrowser


 

perman


 

pioneer


 

playmusic\.com


 


playstarmusic.com

 

powermarks


 

psbot


 

python


 


Python-urllib

 

qihoobot


 

rambler


 

Readpaper

Readpaper

 

redalert| robozilla


 

robot


 

scan4mail


 

scooter


 

seekbot


 

seznambot


 

shoutcast


 

slurp


 

sogou


 

speedy


 

spider


 

spider


 

spiderman


 

spiderview


 

Strider

Strider

 

sunrise


 

superbot


 

surveybot


 

T-H-U-N-D-E-R-S-T-O-N-E

T-H-U-N-D-E-R-S-T-O-N-E

 

tailrank


 

technoratibot


 

Teleport(\s|\+)Pro

Teleport Pro

 

Teoma

Teoma

 

titan


 

turnitinbot


 

twiceler


 

ucsd


 

ultraseek


 

urlaliasbuilder


 

voila


 

w3c-checklink


 

Wanadoo


 

Web(\s|\+)Downloader

Web Downloader

 

WebCloner

WebCloner

 

webcollage


 

WebCopier

WebCopier

 

Webinator


 

Webmetrics


 

webmirror


 

WebReaper

WebReaper

 

WebStripper

WebStripper

 

WebZIP

WebZIP

 

Wget

Wget

 

wordpress


 

worm


 

Xenu(\s|\+)Link(\s|\+)Sleuth

Xenu Link Sleuth

 

y!j


 

yacy


 

yahoo-mmcrawler


 

yahoofeedseeker


 

yahooseeker


 

yandex


 

yodaobot


 

zealbot


 

zeus


 

zyborg


 

The robotlist.txt file might look like this:

Code Block

2010-05-06
[^a]fish 
[+:,\.\;\/\\-]bot 
acme\.spider 
alexa 
Alexandria(\s|\+)prototype(\s|\+)project 
AllenTrack 
almaden 
appie 
Arachmo 
archive\.org_bot 
arks 
asterias 
atomz 
autoemailspider 
awbot 
baiduspider 
bbot 
biadu 
biglotron 
bloglines 
blogpulse 
boitho\.com\-dc 
bookmark\-manager 
bot[+:,\.\;\/\\-] 
Brutus\/AET 
bspider 
bwh3_user_agent 
cfnetwork| checkbot 
China\sLocal\sBrowse\s2\.6 
combine 
commons\-httpclient 
ContentSmartz 
core 
crawl 
cursor 
custo 
DataCha0s\/2\.0 
Demo\sBot 
docomo 
DSurf 
dtSearchSpider 
dumbot 
easydl 
EmailSiphon 
EmailWolf 
exabot 
fast\-webcrawler 
favorg 
FDM(\s|\+)1 
feedburner 
feedfetcher\-google 
Fetch(\s|\+)API(\s|\+)Request 
findlinks 
gaisbot 
GetRight 
geturl 
gigabot 
girafabot 
gnodspider 
Goldfire(\s|\+)Server 
Googlebot 
grub 
heritrix 
hl_ftien_spider 
holmes 
htdig 
htmlparser 
httpget\-5\.2\.2 
httrack 
HTTrack 
ia_archiver 
ichiro 
iktomi 
ilse 
internetseer 
iSiloX 
java 
jeeves 
jobo 
larbin 
libwww\-perl 
linkbot 
linkchecker 
linkscan 
linkwalker 
livejournal\.com 
lmspider 
LOCKSS 
LWP\:\:Simple 
lwp\-request 
lwp\-tivial 
lwp\-trivial 
lycos 
mediapartners\-google 
megite 
Microsoft(\s|\+)URL(\s|\+)Control 
milbot 
mj12bot 
mnogosearch 
mojeekbot 
momspider 
motor 
msiecrawler 
msnbot 
MuscatFerre 
myweb 
NABOT 
nagios 
NaverBot 
netcraft 
netluchs 
ng\/2\. 
no_user_agent 
nutch 
ocelli 
Offline(\s|\+)Navigator 
OurBrowser 
perman 
pioneer 
playmusic\.com 
powermarks 
psbot 
python 
qihoobot 
rambler 
Readpaper 
redalert| robozilla 
robot 
scan4mail 
scooter 
seekbot 
seznambot 
shoutcast 
slurp 
sogou 
speedy 
spider 
spider 
spiderman 
spiderview 
Strider 
sunrise 
superbot 
surveybot 
T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E 
tailrank 
technoratibot 
Teleport(\s|\+)Pro 
Teoma 
titan 
turnitinbot 
twiceler 
ucsd 
ultraseek 
urlaliasbuilder 
voila 
w3c\-checklink 
Wanadoo 
Web(\s|\+)Downloader 
WebCloner 
webcollage 
WebCopier 
Webinator 
Webmetrics 
webmirror 
WebReaper 
WebStripper 
WebZIP 
Wget 
wordpress 
worm 
Xenu(\s|\+)Link(\s|\+)Sleuth 
y!j 
yacy 
yahoo\-mmcrawler 
yahoofeedseeker 
yahooseeker 
yandex 
yodaobot 
zealbot 
zeus 
zyborg