...
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Example SUSHI ReportResponse wrapped in a SOAP envelope.
  The requested report (Daily Report v1, usage dates 2009-12-22 to 2009-12-23)
  is not available yet, so the response carries an Exception (Number 3) whose
  Data element holds the estimated time of completion.
-->
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
               xsi:schemaLocation="http://schemas.xmlsoap.org/soap/envelope/ http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <!-- xsi:schemaLocation is a whitespace-separated "namespace location" pair;
         the location must be a bare URI (no wiki-link brackets). -->
    <ReportResponse xmlns="http://www.niso.org/schemas/sushi"
                    xmlns:ctr="http://www.niso.org/schemas/sushi/counter"
                    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                    xsi:schemaLocation="http://www.niso.org/schemas/sushi/counter http://www.niso.org/schemas/sushi/counter_sushi3_0.xsd">
      <!-- Party that asked for the report -->
      <Requestor>
        <ID>www.logaggregator.nl</ID>
        <Name>Log Aggregator</Name>
        <Email>logaggregator@surf.nl</Email>
      </Requestor>
      <!-- Institution whose usage is being reported -->
      <CustomerReference>
        <ID>www.leiden.edu</ID>
        <Name>Leiden University</Name>
      </CustomerReference>
      <!-- Echo of the requested report and its date filter -->
      <ReportDefinition Release="urn:DRv1" Name="Daily Report v1">
        <Filters>
          <UsageDateRange>
            <Begin>2009-12-22</Begin>
            <End>2009-12-23</End>
          </UsageDateRange>
        </Filters>
      </ReportDefinition>
      <Exception>
        <Number>3</Number>
        <!-- The continuation line is deliberately not indented: whitespace
             inside Message is part of the text delivered to the client. -->
        <Message>The report is not yet available. The estimated time of completion is
provided under "Data".</Message>
        <Data>2010-01-08T12:13:00+01:00</Data>
      </Exception>
    </ReportResponse>
  </soap:Body>
</soap:Envelope>
|
...
PLoS | COUNTER | NEEO | AWStats | Description | ||
<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="9724cd752bad00a0-9f3e2cf6-49b14c84-8e519147-859aafdacce1e401434e5844"><ac:plain-text-body><![CDATA[ | [^a]fish |
|
|
| ]]></ac:plain-text-body></ac:structured-macro> | |
<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="fc20f5ceb86eae93-5349711f-448e45a2-94598f72-aa4845a042495ff0149f0544"><ac:plain-text-body><![CDATA[ | [+:,\.\;\/\\-]bot |
|
|
| ]]></ac:plain-text-body></ac:structured-macro> | |
acme\.spider |
|
|
| |||
alexa |
|
|
| |||
Alexandria(\s|\+)prototype(\s|\+)project | Alexandria prototype project |
|
|
| ||
AllenTrack |
|
|
| |||
almaden |
|
|
| |||
appie |
|
|
| |||
Arachmo | Arachmo |
|
|
| ||
archive\.org_bot |
|
|
| |||
arks |
|
|
| |||
asterias |
|
|
| |||
atomz |
|
|
| |||
autoemailspider |
|
|
| |||
awbot |
|
|
| |||
baiduspider |
|
|
| |||
bbot |
|
|
| |||
biadu |
|
|
| |||
biglotron |
|
|
| |||
bloglines |
|
|
| |||
blogpulse |
|
|
| |||
boitho\.com-dc |
|
|
| |||
bookmark-manager |
|
|
| |||
<ac:structured-macro ac:name="unmigrated-wiki-markup" ac:schema-version="1" ac:macro-id="bb484b3a2d5bc7c2-78469a77-46f34f09-b83c8977-9e2d05098ca95bd2480c8ac1"><ac:plain-text-body><![CDATA[ | bot[+:,\.\;\/\\-] |
|
|
| ]]></ac:plain-text-body></ac:structured-macro> | |
Brutus\/AET | Brutus/AET |
|
|
| ||
bspider |
|
|
| |||
bwh3_user_agent |
|
|
| |||
cfnetwork |
|
|
| |||
checkbot |
|
|
| |||
China\sLocal\sBrowse\s2\.6 |
|
|
| |||
Code Sample Web Client |
|
|
| |||
combine |
|
|
| |||
commons-httpclient |
|
|
| |||
ContentSmartz |
|
|
| |||
core |
|
|
| |||
crawl |
|
|
| |||
cursor |
|
|
| |||
custo |
|
|
| |||
DataCha0s\/2\.0 |
|
|
| |||
Demo\sBot |
|
|
| |||
docomo |
|
|
| |||
DSurf |
|
|
| |||
dtSearchSpider | dtSearchSpider |
|
|
| ||
dumbot |
|
|
| |||
easydl |
|
|
| |||
EmailSiphon |
|
|
| |||
EmailWolf |
|
|
| |||
exabot |
|
|
| |||
fast-webcrawler |
|
|
| |||
favorg |
|
|
| |||
FDM(\s|\+)1 | FDM 1 |
|
|
| ||
feedburner |
|
|
| |||
feedfetcher-google |
|
|
| |||
Fetch(\s|\+)API(\s|\+)Request | Fetch API Request |
|
|
| ||
findlinks |
|
|
| |||
gaisbot |
|
|
| |||
GetRight | GetRight |
|
|
| ||
geturl |
|
|
| |||
gigabot |
|
|
| |||
girafabot |
|
|
| |||
gnodspider |
|
|
| |||
Goldfire(\s|\+)Server | Goldfire Server |
|
|
| ||
Googlebot | Googlebot |
|
|
| ||
grub |
|
|
| |||
heritrix |
|
|
| |||
hl_ftien_spider |
|
|
| |||
holmes |
|
|
| |||
htdig |
|
|
| |||
htmlparser |
|
|
| |||
httpget-5\.2\.2 | httpget-5.2.2 |
|
|
| ||
httrack |
|
|
| |||
HTTrack | HTTrack |
|
|
| ||
ia_archiver |
|
|
| |||
ichiro |
|
|
| |||
iktomi |
|
|
| |||
ilse |
|
|
| |||
internetseer |
|
|
| |||
iSiloX | iSiloX |
|
|
| ||
java |
|
|
| |||
jeeves |
|
|
| |||
jobo |
|
|
| |||
larbin |
|
|
| |||
libwww-perl | libwww-perl |
|
|
| ||
linkbot |
|
|
| |||
linkchecker |
|
|
| |||
linkscan |
|
|
| |||
linkwalker |
|
|
| |||
livejournal\.com |
|
|
| |||
lmspider |
|
|
| |||
LOCKSS |
|
|
| |||
LWP\:\:Simple | LWP::Simple |
|
|
| ||
lwp-request |
|
|
| |||
lwp-tivial |
|
|
| |||
lwp-trivial | lwp-trivial |
|
|
| ||
lycos |
|
|
| |||
mediapartners-google |
|
|
| |||
megite |
|
|
| |||
Microsoft(\s|\+)URL(\s|\+)Control | Microsoft URL Control |
|
|
| ||
milbot | Milbot |
|
|
| ||
mj12bot |
|
|
| |||
mnogosearch |
|
|
| |||
mojeekbot |
|
|
| |||
momspider |
|
|
| |||
motor |
|
|
| |||
msiecrawler |
|
|
| |||
msnbot |
|
|
| |||
MSNBot |
|
|
| |||
MuscatFerre |
|
|
| |||
myweb |
|
|
| |||
NABOT |
|
|
| |||
nagios |
|
|
| |||
NaverBot | NaverBot |
|
|
| ||
netcraft |
|
|
| |||
netluchs |
|
|
| |||
ng\/2\. |
|
|
| |||
no_user_agent |
|
|
| |||
nutch |
|
|
| |||
ocelli |
|
|
| |||
Offline(\s|\+)Navigator | Offline Navigator |
|
|
| ||
OurBrowser |
|
|
| |||
perman |
|
|
| |||
pioneer |
|
|
| |||
playmusic\.com |
|
|
| |||
playstarmusic\.com |
|
|
| |||
powermarks |
|
|
| |||
psbot |
|
|
| |||
python |
|
|
| |||
Python-urllib |
|
|
| |||
qihoobot |
|
|
| |||
rambler |
|
|
| |||
Readpaper | Readpaper |
|
|
| ||
redalert |
|
|
| |||
robozilla |
|
|
| |||
robot |
|
|
| |||
scan4mail |
|
|
| |||
scooter |
|
|
| |||
seekbot |
|
|
| |||
seznambot |
|
|
| |||
shoutcast |
|
|
| |||
slurp |
|
|
| |||
sogou |
|
|
| |||
speedy |
|
|
| |||
spider |
|
|
| |||
spider |
|
|
| |||
spiderman |
|
|
| |||
spiderview |
|
|
| |||
Strider | Strider |
|
|
| ||
sunrise |
|
|
| |||
superbot |
|
|
| |||
surveybot |
|
|
| |||
T-H-U-N-D-E-R-S-T-O-N-E | T-H-U-N-D-E-R-S-T-O-N-E |
|
|
| ||
tailrank |
|
|
| |||
technoratibot |
|
|
| |||
Teleport(\s|\+)Pro | Teleport Pro |
|
|
| ||
Teoma | Teoma |
|
|
| ||
titan |
|
|
| |||
turnitinbot |
|
|
| |||
twiceler |
|
|
| |||
ucsd |
|
|
| |||
ultraseek |
|
|
| |||
urlaliasbuilder |
|
|
| |||
voila |
|
|
| |||
w3c-checklink |
|
|
| |||
Wanadoo |
|
|
| |||
Web(\s|\+)Downloader | Web Downloader |
|
|
| ||
WebCloner | WebCloner |
|
|
| ||
webcollage |
|
|
| |||
WebCopier | WebCopier |
|
|
| ||
Webinator |
|
|
| |||
Webmetrics |
|
|
| |||
webmirror |
|
|
| |||
WebReaper | WebReaper |
|
|
| ||
WebStripper | WebStripper |
|
|
| ||
WebZIP | WebZIP |
|
|
| ||
Wget | Wget |
|
|
| ||
wordpress |
|
|
| |||
worm |
|
|
| |||
Xenu(\s|\+)Link(\s|\+)Sleuth | Xenu Link Sleuth |
|
|
| ||
y!j |
|
|
| |||
yacy |
|
|
| |||
yahoo-mmcrawler |
|
|
| |||
yahoofeedseeker |
|
|
| |||
yahooseeker |
|
|
| |||
yandex |
|
|
| |||
yodaobot |
|
|
| |||
zealbot |
|
|
| |||
zeus |
|
|
| |||
zyborg |
|
|
|
The robotlist.txt might look like this:
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
2010-05-06
[^a]fish
[+:,\.\;\/\\-]bot
acme\.spider
alexa
Alexandria(\s|\+)prototype(\s|\+)project
AllenTrack
almaden
appie
Arachmo
archive\.org_bot
arks
asterias
atomz
autoemailspider
awbot
baiduspider
bbot
biadu
biglotron
bloglines
blogpulse
boitho\.com\-dc
bookmark\-manager
bot[+:,\.\;\/\\-]
Brutus\/AET
bspider
bwh3_user_agent
cfnetwork
checkbot
China\sLocal\sBrowse\s2\.6
combine
commons\-httpclient
ContentSmartz
core
crawl
cursor
custo
DataCha0s\/2\.0
Demo\sBot
docomo
DSurf
dtSearchSpider
dumbot
easydl
EmailSiphon
EmailWolf
exabot
fast-webcrawler
favorg
FDM(\s|\+)1
feedburner
feedfetcher\-google
Fetch(\s|\+)API(\s|\+)Request
findlinks
gaisbot
GetRight
geturl
gigabot
girafabot
gnodspider
Goldfire(\s|\+)Server
Googlebot
grub
heritrix
hl_ftien_spider
holmes
htdig
htmlparser
httpget\-5\.2\.2
httrack
HTTrack
ia_archiver
ichiro
iktomi
ilse
internetseer
iSiloX
java
jeeves
jobo
larbin
libwww\-perl
linkbot
linkchecker
linkscan
linkwalker
livejournal\.com
lmspider
LOCKSS
LWP\:\:Simple
lwp\-request
lwp\-tivial
lwp\-trivial
lycos
mediapartners\-google
megite
Microsoft(\s|\+)URL(\s|\+)Control
milbot
mj12bot
mnogosearch
mojeekbot
momspider
motor
msiecrawler
msnbot
MuscatFerre
myweb
NABOT
nagios
NaverBot
netcraft
netluchs
ng\/2\.
no_user_agent
nutch
ocelli
Offline(\s|\+)Navigator
OurBrowser
perman
pioneer
playmusic\.com
powermarks
psbot
python
qihoobot
rambler
Readpaper
redalert
robozilla
robot
scan4mail
scooter
seekbot
seznambot
shoutcast
slurp
sogou
speedy
spider
spider
spiderman
spiderview
Strider
sunrise
superbot
surveybot
T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E
tailrank
technoratibot
Teleport(\s|\+)Pro
Teoma
titan
turnitinbot
twiceler
ucsd
ultraseek
urlaliasbuilder
voila
w3c\-checklink
Wanadoo
Web(\s|\+)Downloader
WebCloner
webcollage
WebCopier
Webinator
Webmetrics
webmirror
WebReaper
WebStripper
WebZIP
Wget
wordpress
worm
Xenu(\s|\+)Link(\s|\+)Sleuth
y!j
yacy
yahoo\-mmcrawler
yahoofeedseeker
yahooseeker
yandex
yodaobot
zealbot
zeus
zyborg
|
Or it might look like the proposed XML version of the Robot exclusion list
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Proposed XML form of the robot exclusion list.
  The root carries the version and datestamp of the combined file; each
  robot-list child groups the user-agent match strings contributed by one
  source and carries that source's own version and datestamp.
-->
<exclusions version="1.0" datestamp="2010-04-10">

  <!-- Entries taken from the COUNTER list -->
  <robot-list source="COUNTER" version="R3" datestamp="2010-04-01">
    <description>Human-friendly description/notes about the COUNTER exclusion list</description>
    <!-- One useragent element per string to match -->
    <useragent>String to match for COUNTER</useragent>
    <useragent>Another string to match for COUNTER</useragent>
    <useragent>Etc.</useragent>
  </robot-list>

  <!-- Entries taken from the AWStats list -->
  <robot-list source="AWStats" version="x" datestamp="2009-10-02">
    <description>Human-friendly description/notes about the AWStats exclusion list</description>
    <useragent>String to match for AWStats</useragent>
    <useragent>Another string to match for AWStats</useragent>
    <useragent>Etc.</useragent>
  </robot-list>

  <!-- Entries taken from the PLoS list -->
  <robot-list source="PLoS" version="y" datestamp="2010-03-11">
    <description>Human-friendly description/notes about the PLoS exclusion list</description>
    <useragent>String to match for PLoS</useragent>
    <useragent>Another string to match for PLoS</useragent>
    <useragent>Etc.</useragent>
  </robot-list>

</exclusions>
|