-
Notifications
You must be signed in to change notification settings - Fork 1
/
expand.php
1167 lines (1044 loc) · 50 KB
/
expand.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?
// $Id: expand.php 432 2013-06-10 09:16:30Z MartinS $
/**
 * Fetches one wiki page, expands its citations and optionally saves the result.
 *
 * The raw wikitext of $page is retrieved through the global $bot, handed to
 * expand_text(), and - when the text changed and $commit_edits is still true -
 * written back with write() (up to three attempts) under an edit summary built
 * from the global $modifications array.
 *
 * Output modes, as used in this function:
 *  - $html_output > 0    : the page header is printed as HTML;
 *  - $html_output === -1 : everything echoed here is buffered and discarded;
 *  - otherwise           : the page header is printed as plain text.
 *
 * @param string      $page                      Title of the wiki page.
 * @param bool        $commit_edits              Write the expanded text back to the wiki. Forced to
 *                                               false when a {{nobots}} / {{bots|deny=...}} tag matches.
 * @param bool        $editing_cite_doi_template Forwarded to expand_text(); forced to true when the
 *                                               title contains "Template:Cite".
 * @param string|null $cite_doi_start_code       Wikicode used as the starting text when a cite doi
 *                                               template is being edited and the fetched page is empty.
 *
 * @return string|null The fetched wikitext when the page is a redirect or needs no change;
 *                     otherwise the expanded wikitext returned by expand_text().
 */
function expand($page, // Title of WP page
  $commit_edits = false,
  $editing_cite_doi_template = false, //If $editing_cite_doi_template = true, certain formatting changes will be applied for consistency.
  $cite_doi_start_code = null // $cite_doi_start_code is wikicode specified if creating a cite doi template. (Possibly redundant now?)
) {
  global $bot, $editInitiator, $html_output, $modifications, $edit_summary_end;

  // Decide once whether output is suppressed, so that every exit path below
  // closes exactly the buffer that was opened here.
  $suppress_output = ($html_output === -1);
  if ($suppress_output) {
    ob_start();
  }

  // Report whichever revision number is newer: this file's or the one given by revisionID().
  $file_revision_id = str_replace(array("Revision: ", "$", " "), "", '$Revision: 432 $');
  $doitools_revision_id = revisionID();
  if ($file_revision_id < $doitools_revision_id) {
    $last_revision_id = $doitools_revision_id;
  } else {
    $editInitiator = str_replace($doitools_revision_id, $file_revision_id, $editInitiator);
    $last_revision_id = $file_revision_id;
  }

  $started_page_at = time();
  $page_query = urlencode($page);                              // safe inside a URL query string
  $page_html = htmlspecialchars($page, ENT_QUOTES, 'UTF-8');   // safe inside HTML text
  $subpage = isset($_GET["subpage"]) ? $_GET["subpage"] : "";
  $write_target = $page . $subpage;

  echo "\nRevision #$last_revision_id";
  if ($html_output > 0) {
    echo "\n<hr>[" . date("H:i:s", $started_page_at) . "] Processing page '<a href='http://en.wikipedia.org/?title="
       . $page_query . "' style='font-weight:bold;'>$page_html</a>' — <a href='http://en.wikipedia.org/?title="
       . $page_query . "&action=edit' style='font-weight:bold;'>edit</a>—<a href='http://en.wikipedia.org/?title="
       . $page_query . "&action=history' style='font-weight:bold;'>history</a> <script type='text/javascript'>document.title=\"Citation bot: '"
       . str_replace("+", " ", $page_query) . "'\";</script>";
  } else {
    echo "\n\n\n*** Processing page '$page' : " . date("H:i:s", $started_page_at);
  }

  $bot->fetch(wikiroot . "title=" . $page_query . "&action=raw");
  $original_code = $bot->results;

  if (stripos($original_code, "#redirect") !== FALSE) {
    echo "Page is a redirect.";
    updateBacklog($page);
    if ($suppress_output) {
      ob_end_clean();
    }
    return $original_code;
  }

  if (strpos($page, "Template:Cite") !== FALSE) {
    $editing_cite_doi_template = true;
  }
  if ($editing_cite_doi_template && !$original_code) {
    $original_code = $cite_doi_start_code;
  }

  // Respect bot-exclusion tags: the page is still processed, but never saved.
  if (preg_match("/\{\{nobots\}\}|\{\{bots\s*\|\s*deny\s*=[^}]*(Citation[ _]bot|DOI[ _]bot|all)[^}]*\}\}|\{\{bots\s*\|\s*allow=none\}\}/i", $original_code, $denyMsg)) {
    echo "**** Bot forbidden by bots / nobots tag: $denyMsg[0]";
    updateBacklog($page);
    $commit_edits = false;
  }

  $new_code = expand_text($original_code, $commit_edits, $editing_cite_doi_template,
                          $cite_doi_start_code);

  // If no changes are necessary, we don't need to do anything.
  if (strtolower($new_code) === strtolower($original_code)) {
    echo trim($new_code) ? "\n ** No changes required." : "\n ** Blank page retrieved.";
    echo "\n # # # \n";
    updateBacklog($page);
    if ($suppress_output) {
      ob_end_clean();
    }
    return $original_code;
  }

  ##### Generate edit summary #####
  $auto_summary = "";
  if (!empty($modifications["additions"])) {
    $auto_summary = "Add: ";
    foreach ($modifications["additions"] as $param => $v) {
      $auto_summary .= "$param, ";
      unset($modifications["additions"][$param]);
    }
    // Drop the trailing ", " before closing the sentence.
    $auto_summary = substr($auto_summary, 0, strlen($auto_summary) - 2);
    $auto_summary .= ". ";
  }
  if (!empty($modifications["removed"]["accessdate"])) {
    $auto_summary .= "Removed accessdate with no specified URL. ";
    // Removed so that the generic "redundant parameters" note below only fires for other removals.
    unset($modifications["removed"]["accessdate"]);
  }
  if (!empty($modifications["changes"])) {
    $auto_summary .= "Tweak: ";
    foreach ($modifications["changes"] as $param => $v) {
      $auto_summary .= "$param, ";
    }
    $auto_summary = substr($auto_summary, 0, strlen($auto_summary) - 2);
    $auto_summary .= ". ";
  }
  if (!empty($modifications["removed"])) {
    $auto_summary .= "Removed redundant parameters. ";
  }
  // NOTE(review): $unify_citation_templates is never assigned in this function (expand_text()
  // keeps its own local of that name), so only $modifications["cite_type"] can trigger this.
  if (!empty($modifications["cite_type"]) || !empty($unify_citation_templates)) {
    $auto_summary .= "Unified citation types. ";
  }
  if (!empty($modifications["combine_references"])) {
    $auto_summary .= "Combined duplicate references. ";
  }
  if (!empty($modifications["dashes"])) {
    $auto_summary .= "Formatted [[WP:ENDASH|dashes]]. ";
  }
  if (!empty($modifications["arxiv_upgrade"])) {
    $auto_summary .= "Updated published arXiv refs. ";
  }
  if (!$auto_summary) {
    $auto_summary = "Misc citation tidying. ";
  }
  $modifications = null; // reset for the next page
  echo $auto_summary;
  $edit_summary = $editInitiator . $auto_summary . $edit_summary_end;

  if ($commit_edits) {
    // The sandbox branch is deliberately disabled by the leading `false` (see the original todo).
    if (false /*todo remove false */ && strpos($page, "andbox") > 1) {
      echo $html_output?"<br><i style='color:red'>Writing to <a href=\"http://en.wikipedia.org/w/index.php?title="
        . $page_query . "\">$page_html</a> <small><a href=http://en.wikipedia.org/w/index.php?title="
        . $page_query . "&action=history>history</a></small></i>\n\n</br><br>":"\n*** Writing to $page";
      write($write_target, $new_code, $editInitiator . "Citation maintenance: Fixing/testing bugs. "
        . "Problems? [[User_talk:Smith609|Contact the bot's operator]]. ");
    } else {
      echo "<br><i style='color:red'>Writing to <a href=\"http://en.wikipedia.org/w/index.php?title=" . $page_query . "\">$page_html</a> ... ";

      // Up to three write attempts; a notice is printed between consecutive attempts.
      $retry_notices = array(
        "Edit may have failed. Retrying: <span style='font-size:1px'>xxx</span> ",
        "Still no good. One last try: ",
      );
      $max_attempts = count($retry_notices) + 1;
      for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
        $status = write($write_target, $new_code, $edit_summary);
        if ($status == "Success") {
          updateBacklog($page);
          echo ($attempt == $max_attempts) ? "Success. Phew!" : "Success.";
          break;
        }
        if ($attempt < $max_attempts) {
          echo $retry_notices[$attempt - 1];
        } else {
          // NOTE(review): $outputText is never assigned in this function, so in HTML mode
          // nothing is appended after the error code.
          echo "Failed. Error code: $status. "
             . ($html_output ? (isset($outputText) ? $outputText : "") : "Pagecode displayed in HTML output only");
        }
      }

      if ($html_output) {
        echo " <small><a href=http://en.wikipedia.org/w/index.php?title=" . $page_query . "&action=history>history</a> / "
           . "<a href=http://en.wikipedia.org/w/index.php?title=" . $page_query . "&diff=prev&oldid="
           . getLastRev($page) . ">last edit</a></small></i>\n\n<br>";
      } else {
        echo ".";
      }
    }

    ##### Handle problems with cite doi templates #####
    if ($editing_cite_doi_template) {
      // NOTE(review): $doiCrossRef, $oDoi and $article_in_progress are neither declared global
      // nor assigned in this function, so as written the message below can never be sent.
      // Confirm whether they were meant to be globals.
      if (!articleID($page) && empty($doiCrossRef) && !empty($oDoi)) {
        leave_broken_doi_message($page, isset($article_in_progress) ? $article_in_progress : null, $oDoi);
      }
      $doiCrossRef = null;
    }
  }

  if ($suppress_output) {
    ob_end_clean();
  }

  // These variables should change after the first edit
  // NOTE(review): $isbnKey is not in this function's global list, so this assignment only
  // sets a local that is discarded on return; $edit_summary_end is global and does persist.
  $isbnKey = "3TUCZUGQ"; //This way we shouldn't exhaust theISBN key for on-demand users.
  #$isbnKey2 = "RISPMHTS"; //This way we shouldn't exhaust theISBN key for on-demand users.
  $edit_summary_end = " You can [[WP:UCB|use this bot]] yourself. [[WP:DBUG|Report bugs here]].";
  return $new_code;
}
// This function, given $original_code, returns the $text with any citation templates expanded as far as possible.
function expand_text ($original_code,
$commit_edits = false,
$editing_cite_doi_template = false, //If $editing_cite_doi_template = true, certain formatting changes will be applied for consistency.
$cite_doi_start_code = null // $cite_doi_start_code is wikicode specified if creating a cite doi template. (Possibly redundant now?)
) {
global $p, $pStart, $editInitiator, $edit_summaryStart, $initiatedBy,
$authors_missing,
$edit_summary_end, $slow_mode, $html_output;
if ($html_output === -1) {
ob_start();
}
// Are multiple authors suppressed by 'display_authors'?
$display_authors = preg_match('~\|\s*display_authors\s*=~', $original_code);
// Which template family is dominant?
if (!$editing_cite_doi_template) {
preg_match_all("~\{\{\s*[Cc]ite[ _](\w+)~", $original_code, $cite_x);
preg_match_all("~\{\{\s*cite[ _](doi|pm|jstor|arx)~i", $original_code, $cite_id);
preg_match_all("~\{\{\s*[Cc]itation\b(?! \w)~", $original_code, $citation);
$cite_x_count = count ($cite_x[0]);
$citation_count = count ($citation[0]);
$cite_id_count = count ($cite_id[0]);
if ($cite_id_count > 3 || $cite_id_count + 1 >= ($cite_x_count + $citation_count - $cite_id_count)) {
echo "\n - switch to cite id format is supported.";
}
$harv_template_present = (stripos($original_code, "{{harv") === false)?false:true;
if ($cite_x_count * $citation_count > 0) {
// Two types are present
$unify_citation_templates = true;
$citation_template_dominant = ($cite_x_count < $citation_count);
echo "\n * " . (($citation_template_dominant)?"\"Citation\"":'"Cite xxx"') . " format is dominant on this page: " .
$cite_x_count . " cite / " . $citation_count . " citation." ;
} else {
$unify_citation_templates = false;
$citation_template_dominant = false;
}
}
// Start by fixing any sloppy wikicode:
echo "\n * Looking for bare references... ";
$new_code = preg_replace_callback("~(?P<open><ref[^>]*>)\[?(?P<url>http://(?:[^\s\]<]|<(?!ref))+) ?\]?\s*(?P<close></\s*ref>)~",
create_function('$matches',
'return $matches["open"] . url2template($matches["url"], $citation_template_dominant) . $matches["close"];'
),
$original_code);
// Check for baggage in a "Cite doi" template:
$cite_doi_baggage_regexp = "~({{[cC]ite doi\s*\|\s*)d?o?i?\s*[:.,;>]?\s*~";
if (preg_match($cite_doi_baggage_regexp, $new_code)) {
echo "\n Correcting broken Cite doi template";
$new_code = preg_replace($cite_doi_baggage_regexp, "$1", $new_code);
}
echo "\n * Tidying reference tags... ";
$new_code = rename_references(named_refs_in_reflist(combine_duplicate_references(combine_duplicate_references(ref_templates(ref_templates(ref_templates(ref_templates($new_code, "doi"), "pmid"), "jstor"), "pmc")))));
$pageDash_ereg = "p(p|ages)([\t ]*=[\t ]*[0-9a-Z]*[0-9][a-Z]*)[\t ]*(" . to_en_dash . ")[\t ]*([0-9A-Z])";
if (mb_ereg($pageDash_ereg, $new_code)) {
$new_code = mb_ereg_replace($pageDash_ereg, "p\\1\\2" . en_dash . "\\4", $new_code);
$modifications["dashes"] = true;
echo "\n - Converted dashes in all page parameters to en-dashes.";
}
################################### Cite web ######################################
// Convert Cite webs to Cite arXivs, etc, if necessary
if (false !== ($citation = preg_split("~{{((\s*[Cc]ite[_ ]?(?:[Nn]ews|[wW]eb)(?=\s*\|))([^{}]|{{.*}})*)([\n\s]*)}}~U", $new_code, -1, PREG_SPLIT_DELIM_CAPTURE))) {
$new_code = null;
$iLimit = (count($citation) - 1);
for ($cit_i = 0; $cit_i < $iLimit; $cit_i += 5) {//Number of brackets in cite arXiv regexp + 1
$started_citation_at = time();
$p = parameters_from_citation($citation[$cit_i + 1]);
//Make a note of how things started so we can give an intelligent edit summary
foreach ($p as $param => $value) {
if (is($param)) {
$pStart[$param] = $value[0];
}
}
// See if we can use any of the parameters lacking equals signs:
useUnusedData();
if (trim(str_replace("|", "", $p["unused_data"][0])) == "") {
unset($p["unused_data"]);
}
else if (substr(trim($p["unused_data"][0]), 0, 1) == "|") {
$p["unused_data"][0] = substr(trim($p["unused_data"][0]), 1);
}
echo "\n* Cite web / news: {$p["title"][0]}";
// Fix typos in parameter names
//Authors
if (isset($p["authors"]) && !isset($p["author"][0])) {$p["author"] = $p["authors"]; unset($p["authors"]);}
// Delete any parameters >10, which won't be displayed anyway
for ($au_i = 10; isset($p["last$au_i"]) || isset($p["author$au_i"]); $au_i++) {
unset($p["last$au_i"]);
unset($p["first$au_i"]);
unset($p["author$au_i"]);
}
// Get identifiers from URL
get_identifiers_from_url();
// Now wikify some common formatting errors - i.e. tidy up!
tidy_citation();
// Now: Citation bot task 5. If there's a journal parameter switch the citation to 'cite journal'.
$change_to_journal = is('journal') || is('bibcode') || is('jstor');
$change_to_arxiv = is('arxiv');
if (($change_to_arxiv || $change_to_journal) && is('eprint')) {
rename_parameter('eprint', 'arxiv');
$modifications["cite_type"] = true;
} else if (is('arxiv') && !is('class')) {
rename_parameter('arxiv', 'eprint');
}
//And we're done!
$endtime = time();
$timetaken = $endtime - $started_citation_at;
echo "\n* Citation assessed in $timetaken secs. "
. ($change_to_journal
?"Changing to Cite Journal. "
:($change_to_arxiv
?"Changing to Arxiv. "
:"Not changing citation template. ")
) . "\n";
$cText .= reassemble_citation($p); // This also populates $modifications["additions"] and $modifications["changes"], if 'set' hasn't got them already
$last_p = $p;
$p = null;
$new_code .= $citation[$cit_i] . ($cText?"{{" . ($change_to_journal?"cite journal":($change_to_arxiv?"cite arxiv":$citation[$cit_i+2])) . "$cText{$citation[$cit_i+4]}}}":"");
# $new_code .= $citation[$cit_i] . ($cText?"{{{$citation[$cit_i+2]}$cText{$citation[$cit_i+4]}}}":"");
$cText = null;
$crossRef = null;
}
$new_code .= $citation[$cit_i]; // Adds any text that comes after the last citation
}
################################### Cite arXiv ######################################
// Makes sense to do this first as it might add DOIs, changing the citation type.
if (false !== ($citation = preg_split("~{{((\s*[Cc]ite[_ ]?[aA]r[xX]iv(?=\s*\|))([^{}]|{{.*}})*)([\n\s]*)}}~U", $new_code, -1, PREG_SPLIT_DELIM_CAPTURE))) {
$new_code = null;
$iLimit = (count($citation)-1);
for ($cit_i=0; $cit_i<$iLimit; $cit_i+=5) {//Number of brackets in cite arXiv regexp + 1
$started_citation_at = time();
$p = parameters_from_citation($citation[$cit_i+1]);
//Make a note of how things started so we can give an intelligent edit summary
foreach($p as $param=>$value) {
if (is($param)) {
$pStart[$param] = $value[0];
}
}
// See if we can use any of the parameters lacking equals signs:
$freeDat = explode("|", trim($p["unused_data"][0]));
useUnusedData();
if (trim(str_replace("|", "", $p["unused_data"][0])) == "") unset($p["unused_data"]);
else if (substr(trim($p["unused_data"][0]), 0, 1) == "|") $p["unused_data"][0] = substr(trim($p["unused_data"][0]), 1);
echo "\n* {$p["title"][0]}";
// Fix typos in parameter names
//Authors
if (isset($p["authors"]) && !isset($p["author"][0])) {$p["author"] = $p["authors"]; unset($p["authors"]);}
// Delete any parameters >10, which won't be displayed anyway
for ($au_i = 10; isset($p["last$au_i"]) || isset($p["author$au_i"]); $au_i++) {
unset($p["last$au_i"]);
unset($p["first$au_i"]);
unset($p["author$au_i"]);
}
// Is there already a date parameter?
$dateToStartWith = (isset($p["date"][0]) && !isset($p["year"][0])) ;
echo $p["eprint"][0] . "\n";
if (is("eprint")
&& !(is("title") && is("author") && is("year") && is("version"))) {
$p["eprint"][0] = str_ireplace("arXiv:", "", $p["eprint"][0]);
echo " * Getting data from arXiv " . $p["eprint"][0];
if (!get_data_from_arxiv($p["eprint"][0]) && is("class")) {
get_data_from_arxiv($p["class"][0] . "/" . $p["eprint"][0]);
}
}
if (is ("doi") && !is("journal")) {
echo "\n * Fill in journal from CrossRef?";
get_data_from_doi($p["doi"][0]);
}
tidy_citation();
// Now: Citation bot task 5. If there's a journal parameter switch the citation to 'cite journal'.
$change_to_journal = is('journal');
if ($change_to_journal && is('eprint')) {
rename_parameter('eprint', 'arxiv');
unset($p['class']);
$modifications["arxiv_upgrade"] = true;
} else {
$modifications["cite_type"] = false;
}
//And we're done!
$endtime = time();
$timetaken = $endtime - $started_citation_at;
echo "* Citation assessed in $timetaken secs. " . ($change_to_journal?"Changing to Cite Journal. ":"Keeping as cite arXiv") . "\n";
$cText .= reassemble_citation($p); // This also populates $modifications["additions"] and $modifications["additions"]
$last_p = $p;
$p = null;
$new_code .= $citation[$cit_i] . ($cText?"{{" . ($change_to_journal?"cite journal":$citation[$cit_i+2]) . "$cText{$citation[$cit_i+4]}}}":"");
$cText = null;
$crossRef = null;
}
$new_code .= $citation[$cit_i]; // Adds any text that comes after the last citation
}
################################### START ASSESSING BOOKS ({{cite book}} ######################################
if (false !== ($citation = preg_split("~{{((\s*[Cc]ite[_ ]?[bB]ook(?=\s*\|))([^{}]|{{.*}})*)([\n\s]*)}}~U", $new_code, -1, PREG_SPLIT_DELIM_CAPTURE))) {
$new_code = null;
$iLimit = (count($citation)-1);
for ($cit_i = 0; $cit_i < $iLimit; $cit_i += 5) {//Number of brackets in cite book regexp +1
$started_citation_at = time();
// Remove any comments so they don't confuse any regexps.
if (preg_match_all("~<!--[\s\S]+-->~U", $citation[$cit_i+1], $comments)) {
$countComments = count($comments[0]);
for ($j = 0; $j < $countComments; $j++) {
$citation[$cit_i+1] = str_replace($comments[0][$j]
, sprintf(comment_placeholder, "b$j")
, $citation[$cit_i+1]);
}
} else $countComments = null;
$p = parameters_from_citation($citation[$cit_i+1]);
//Make a note of how things started so we can give an intelligent edit summary
foreach ($p as $param=>$value) {
if (is($param)) {
$pStart[$param] = $value[0];
}
}
//Check for the doi-inline template in the title
if (preg_match("~\{\{\s*doi-inline\s*\|\s*(10\.\d{4}/[^\|]+)\s*\|\s*([^}]+)}}~",
str_replace(pipePlaceholder, "|", $p['title'][0]), $match)) {
set('title', $match[2]);
set('doi', $match[1]);
}
if ($display_authors) handle_et_al();
useUnusedData();
id_to_parameters();
if (trim(str_replace("|", "", $p["unused_data"][0])) == "") {
unset($p["unused_data"]);
} else if (substr(trim($p["unused_data"][0]), 0, 1) == "|") {
$p["unused_data"][0] = substr(trim($p["unused_data"][0]), 1);
}
echo "\n* {$p["title"][0]}";
// Now, check for typos
$p = correct_parameter_spelling($p);
if (google_book_expansion()) {
echo "\n * Expanded from Google Books API.";
}
// Having expanded all that we can expand, tidy things up.
// edition -- remove 'edition' from parameter value
if (is("edition"))
{
$p["edition"][0] = preg_replace("~\s+ed(ition)?\.?\s*$~i", "", $p["edition"][0]);
}
if ($p["doi"][0] == "10.1267/science.040579197") {
// This is a bogus DOI from the PMID example file
unset ($p["doi"]);
}
//Authors
if (isset($p["authors"]) && !isset($p["author"][0])) {
$p["author"] = $p["authors"];
unset($p["authors"]);
}
// Is there already a date parameter?
$dateToStartWith = (isset($p["date"][0]) && !isset($p["year"][0]));
if (!isset($p["date"][0]) && !isset($p["year"][0]) && is('origyear')) {
rename_parameter('origyear', 'year');
}
$isbnToStartWith = isset($p["isbn"]);
if (!isset($p["isbn"][0]) && is("title")) set("isbn", findISBN( $p["title"][0], $p["author"][0] . " " . $p["last"][0] . $p["last1"][0]));
else {
echo "\n Already has an ISBN. ";
}
if (!$isbnToStartWith && !$p["isbn"][0]) unset($p["isbn"]);
/* ISBN lookup disabled -- too buggy.
if ( (is("pages") || is("page"))
&& is("title")
&& is("publisher")
&& (is("date") || is("year"))
&& (
is("author") || is("coauthors") || is("others")
|| is("author1")
|| is("author1-last")
|| is("last") || is("last1")
|| is("editor1-first") || is("editor1-last") || is("editor1")
|| is("editor") || is("editors")
)
)
echo "All details present - no need to look up ISBN. ";
else {
if (is("isbn")) getInfoFromISBN();
}
*/
##############################
# Finished with citation and retrieved ISBN data #
#############################
// Now wikify some common formatting errors - i.e. tidy up!
if (isset($p["title"][0]) && !trim($pStart["title"])) $p["title"][0] = niceTitle($p["title"][0]);
if (isset($p[$journal][0])) $p[$journal][0] = niceTitle($p[$journal][0], false);
if (isset($p["periodical"][0])) $p["periodical"][0] = niceTitle($p["periodical"][0], false);
if (isset($p["pages"][0]) && mb_ereg("([0-9A-Z])[\t ]*(-|\—|\xe2\x80\x94|\?\?\?)[\t ]*([0-9A-Z])", $p["pages"][0])) {
$p["pages"][0] = mb_ereg_replace("([0-9A-Z])[\t ]*(-|\—|\xe2\x80\x94|\?\?\?)[\t ]*([0-9A-Z])", "\\1\xe2\x80\x93\\3", $p["pages"][0]);
$modifications["dashes"] = true;
}
#if (isset($p["year"][0]) && trim($p["year"][0]) == trim($p["origyear"][0])) unset($p['origyear']);
#if (isset($p["publisher"][0])) $p["publisher"][0] = truncatePublisher($p["publisher"][0]);
if ($dateToStartWith) unset($p["year"]); // If there was a date parameter to start with, don't add a year too!
// If we have any unused data, check to see if any is redundant!
if (is("unused_data")) {
$freeDat = explode("|", trim($p["unused_data"][0]));
unset($p["unused_data"]);
foreach ($freeDat as $dat) {
$eraseThis = false;
foreach ($p as $oP) {
similar_text(strtolower($oP[0]), strtolower($dat), $percentSim);
if ($percentSim >= 85)
$eraseThis = true;
}
if (!$eraseThis) $p["unused_data"][0] .= "|" . $dat;
}
if (trim(str_replace("|", "", $p["unused_data"][0])) == "") unset($p["unused_data"]);
else {
if (substr(trim($p["unused_data"][0]), 0, 1) == "|") {
$p["unused_data"][0] = substr(trim($p["unused_data"][0]), 1);
}
echo "\n* Unused data in following book citation: {$p["unused_data"][0]}";
}
}
//And we're done!
$endtime = time();
$timetaken = $endtime - $started_citation_at;
echo "\n Book reference assessed in $timetaken secs.";
$cText .= reassemble_citation($p); // This also populates $modifications["additions"] and $modifications["additions"]
$last_p = $p;
$p = null;
// Convert into citation or cite journal, as appropriate
if ($citation_template_dominant) {
$citation[$cit_i+2] = preg_replace("~[cC]ite[ _]\w+~", "Citation", $citation[$cit_i+2]);
}
// Restore comments we hid earlier
for ($j = 0; $j < $countComments; $j++) {
$cText = str_replace(sprintf(comment_placeholder, "b$j")
, $comments[0][$j]
, $cText);
}
$new_code .= $citation[$cit_i] . ($cText?"{{{$citation[$cit_i+2]}$cText{$citation[$cit_i+4]}}}":"");
$cText = null;
$crossRef = null;
}
$new_code .= $citation[$cit_i]; // Adds any text that comes after the last citation
}
################################### START ASSESSING JOURNAL/OTHER CITATIONS ######################################
if (false !== ($citation = preg_split("~\{\{((\s*[Cc]ite[_ ]?[jJ]ournal(?=\s*\|)|\s*[Cc]ite[_ ]?[dD]ocument(?=\s*\|)|\s*[Cc]ite[_ ]?[Ee]ncyclopa?edia(?=\s*\|)|[cCite[ _]web(?=\s*\|)|\s*[cC]itation(?=\s*\|))([^\{\}]|\{\{.*\}\})*)([\n\s]*)\}\}~U", $new_code, -1, PREG_SPLIT_DELIM_CAPTURE))) {
$new_code = null;
$iLimit = (count($citation) - 1);
for ($cit_i = 0; $cit_i < $iLimit; $cit_i += 5) { //Number of brackets in cite journal regexp + 1
$started_citation_at = time();
// Strip comments, which may contain misleading pipes etc
if (preg_match_all("~<!--[\s\S]+-->~U", $citation[$cit_i+1], $comments)) {
$countComments = count($comments[0]);
for ($j = 0; $j < $countComments; $j++) {
$citation[$cit_i+1] = str_replace($comments[0][$j]
, sprintf(comment_placeholder, "c$j")
, $citation[$cit_i+1]);
}
} else {
// Comments will be replaced in the cText variable later
$countComments = null;
}
$p = parameters_from_citation($citation[$cit_i+1]);
$authors_missing = false; // reset
if ($p["doix"]) {
$p["doi"][0] = str_replace($dotEncode, $dotDecode, $p["doix"][0]);
unset($p["doix"]);
}
//Make a note of how things started so we can give an intelligent edit summary
foreach ($p as $param=>$value) if (is($param)) {
$pStart[$param] = $value[0];
}
if (is("inventor") ||
is("inventor-last") ||
is("patent-number")) {
echo "\n xxx Citation bot does not handle patent citations.";
} else {
//Check for the doi-inline template in the title
if (preg_match("~\{\{\s*doi-inline\s*\|\s*(10\.\d{4}/[^\|]+)\s*\|\s*([^}]+)}}~"
, str_replace(pipePlaceholder, "|", $p['title'][0])
, $match
)
) {
set('title', $match[2]);
set('doi', $match[1]);
}
id_to_parameters();
###########################
// JOURNALS
//
echo "
*-> {$p["title"][0]}
1: Tidy citation and try to expand
";
// See if we can get any 'free' metadata from:
// * mis-labelled parameters
// * ISBN
// * SICI
// * Tidying up existing parameters (and we'll do more tidying here too)
//
###########################
if ($display_authors) handle_et_al();
$journal = is("periodical") ? "periodical" : "journal";
// See if we can use any of the parameters lacking equals signs:
useUnusedData();
if (google_book_expansion()) {
echo "\n * Expanded from Google Books API.";
}
/*if (is("url") && !is("journal") && !is("periodical") && !is("magazine") && !is("newspaper")) {
SpencerK's API; disabled until I check whether it is ever a source of errors
if_null_set("publisher", trim(file_get_contents("http://referee.freebaseapps.com/?url=" . $p["url"][0])));
}*/
/* ISBN lookup removed - too buggy. TODO (also commented out above)
if (is("isbn")) getInfoFromISBN();
*/
// If the page has been created manually from a cite doi link, it will have an encoded 'doix' parameter - decode this.
if (preg_match("~^10.\d~", $p['doix'][0])) {
$p['doi'][0] = str_replace($dotEncode, $dotDecode, $p['doix'][0]);
unset($p['doix']);
}
get_identifiers_from_url();
id_to_parameters();
if (trim(str_replace("|", "", $p["unused_data"][0])) == "") {
unset($p["unused_data"]);
} else {
if (substr(trim($p["unused_data"][0]), 0, 1) == "|") {
$p["unused_data"][0] = substr(trim($p["unused_data"][0]), 1);
}
}
if (trim ($p["quotes"][0]) == "yes" || trim ($p["quotes"][0]) == "no") {
unset ($p["quotes"]);
}
// Load missing parameters from SICI, if we found one...
get_data_from_sici($citation[$cit_i+1]);
// Fix typos in parameter names
$p = correct_parameter_spelling($p);
// DOI - urldecode
if (isset($p["doi"][0])) {
$p["doi"][0] = str_replace($pcEncode, $pcDecode,
str_replace(' ', '+', trim(urldecode($p["doi"][0]))));
$doi_with_comments_removed = preg_replace("~<!--[\s\S]*-->~U", "", $p["doi"][0]);
if (preg_match("~10\.\d{4}/\S+~", $doi_with_comments_removed, $match) && $p["doi"][0] != $match[0]) {
set("doi", $match[0]);
}
} elseif (preg_match("~10\.\d{4}/[^&\s\|]*~", $p["url"][0], $match)) {
// Search the URL for anything in a DOI format.
$p["doi"]= $p["url"];
$p["doi"][0] = preg_replace("~(\.x)/(?:\w+)~", "$1", $match[0]);
unset ($p["url"]);
} elseif (preg_match("~10\.\d{4}/[^&\s\|]*~", urldecode($c), $match)) {
// Search the entire citation text for anything in a DOI format.
// This is quite a broad match, so we need to ensure that no baggage has been tagged on to the end of the URL.
// Wiley have a habit of using the DOI as part of the URL, so we ought to trim any /abstract or /pdf that's following it.
$p["doi"][0] = preg_replace("~(\.x)/(?:\w+)~", "$1", $match[0]);
}
$doiToStartWith = isset($p["doi"]);
// Check that the DOI works; if not, fix it.
verify_doi($p["doi"][0]);
// co-authors
if (is('co-author') && !is('coauthors') && !is('coauthor')) {
$p['coauthor'] = $p['co-author'];
unset($p['co-author']);
}
if (is('co-authors') && !is('coauthors') && !is('coauthor')) {
$p['coauthors'] = $p['co-authors'];
unset($p['co-authors']);
}
// pmid = PMID 1234 can produce pmpmid = 1234
if (isset($p["pmpmid"])) {
$p["pmid"] = $p["pmpmid"];
unset($p["pmpmid"]);
}
//Authors
// Move authors -> author
if (isset($p["authors"]) && !isset($p["author"][0])) {
$p["author"] = $p["authors"];
unset($p["authors"]);
}
// Replace "volume = B 120" with "series=VB, volume = 120
if (preg_match("~^([A-J])(?!\w)\d*\d+~u", $p["volume"][0], $match)) {
if (trim($p["journal"][0]) && mb_substr(trim($p["journal"][0]), -2) != " $match[1]") {
$p["journal"][0] .= " $match[1]";
$p["volume"][0] = trim(mb_substr($p["volume"][0], mb_strlen($match[1])));
}
}
$author_param = trim($p['author'][0]);
// Check for translator in author_param and remove if necessary.
$translator_regexp = "~\b([Tt]r(ans(lat...?(by)?)?)?\.)\s([\w\p{L}\p{M}\s]+)$~u";
if (preg_match($translator_regexp, $author_param, $match)) {
if (is('others')) {
$p['others'][0] .= "; {$match[1]} {$match[5]}";
} else {
set ("others", "{$match[1]} {$match[5]}");
}
// Remove translator from both author_parm and $p
$author_param = preg_replace($translator_regexp, "", $author_param);
$p['author'][0] = $author_param;
}
// Is there already a date parameter?
$dateToStartWith = (isset($p["date"][0]) && !isset($p["year"][0])) ;
// By this point we'll have recovered any DOI or PMID that is hidden in the citation data itself.
#####################################
// Step 2: make sure the citation has a DOI, searching for one if necessary.
if (is('doi')) {
  if (!nothingMissing($journal)) {
    // NOTE(review): $crossRef is not assigned anywhere above in this section of
    // the script - confirm it is initialised before this point.
    expand_from_crossref($crossRef);
  }
  echo "\n2: DOI already present";
  // TODO: Use DOI to expand citation
} else {
  echo "\n2: Find DOI";
  // Now we have got the citation ship-shape, let's try to find a DOI.
  //
  // Every "do we still lack a DOI?" test below uses empty() on $p["doi"][0].
  // We only reach this branch when is('doi') is false, so a doi parameter that
  // is present but blank must not switch off the JSTOR and URL lookups (which
  // is what a bare !isset() test did).
  #####################################
  // First, expand citation by any available means.
  // Try AdsAbs
  if ($slow_mode || is('bibcode')) {
    echo "\n - Checking AdsAbs database [expand/expand_text]";
    get_data_from_adsabs();
  } else {
    echo "\n - Skipping AdsAbs database: not in slow mode.";
  }
  // Expand from JSTOR
  if (empty($p["doi"][0]) && is("jstor")) {
    echo "\n - Checking JSTOR database [expand/expand_text]";
    get_data_from_jstor("10.2307/" . $p["jstor"][0]);
  }
  // We should now have enough information to find a DOI through CrossRef
  if (empty($p["doi"][0])) {
    //Ask CrossRef for a DOI
    echo "\n - Checking CrossRef database... ";
    $crossRef = crossRefDoi(trim($p["title"][0]), trim($p[$journal][0]),
      get_first_author($p), trim($p["year"][0]), trim($p["volume"][0]),
      get_first_page($p), get_last_page($p), trim($p["issn"][0]), trim($p["url"][0]));
    if ($crossRef) {
      $p["doi"][0] = $crossRef->doi;
      echo "Match found: " . $p["doi"][0];
    } else {
      echo "no match.";
    }
  }
  // If that didn't work, we can try scraping a DOI from the URL. Meta tags are usually our best bet here, see findDoi().
  if (empty($p["doi"][0])) {
    // Accept both http:// and https:// URLs; anything else is reported as invalid.
    if (isset($p["url"][0]) && preg_match("~https?://~", $p["url"][0])) {
      // HTML comments embedded in the parameter are ignored when inspecting or fetching the URL.
      $url_without_comments = preg_replace("~<!--.*-->~", "", $p["url"][0]);
      if (substr($url_without_comments, -4) == ".pdf") {
        echo $html_output
          ? ("\n - Avoiding <a href=\"" . $p["url"][0] . "\">PDF URL</a>. <br>")
          : "\n - Avoiding PDF URL {$p["url"][0]}";
      } else {
        //Try using URL parameter
        echo $html_output
          ? ("\n - Trying <a href=\"" . $p["url"][0] . "\">URL</a>. <br>")
          : "\n - Trying URL {$p["url"][0]}";
        $doi = findDoi($url_without_comments);
        if ($doi) {
          echo " found doi $doi";
          $p["doi"][0] = $doi;
        } else {
          echo " no doi found.";
        }
      }
    } else {
      echo "No valid URL specified. ";
    }
  }
}
// Drop a doi parameter that is still without a value, unless the citation had one to start with.
if (!$doiToStartWith && !is("doi")) {
  unset($p["doi"]);
}
#####################################
// Step 3: search for a PMID when the citation has none, then query PubMed
// (by pmid, and separately by pmc) while the citation is still incomplete.
if (!is ('pmid')) {
  echo "\n3: Find PMID & expand";
  searchForPmid();
} else {
  echo "\n3: PMID already present";
}
// nothingMissing() is evaluated afresh before each lookup, and before is(),
// so the call order matches two independent "incomplete && has id" tests.
if (!nothingMissing($journal)) {
  if (is('pmid')) {
    get_data_from_pubmed();
  }
}
if (!nothingMissing($journal)) {
  if (is('pmc')) {
    get_data_from_pubmed('pmc');
  }
}
#####################################
// Step 4: when something is still missing, expand from CrossRef if a DOI is
// available; without a DOI, $crossRef is reset to null.
if (!nothingMissing($journal)) {
  echo "\n4: Expand citation";
  if (!is("doi")) {
    echo "\n - No DOI; can't check CrossRef";
    $crossRef = null;
  } else {
    $crossRef = expand_from_crossref($crossRef, $editing_cite_doi_template);
  }
} else {
  echo "\n4: Citation complete :-)";
}
#####################################
// We have now recovered all possible information from CrossRef.
// If we're using a Cite Doi subpage and there's a doi present, check for a second author. Only do this on first visit (i.e. when citedoi = true)
echo "\n5: Formatting and other tweaks";
if ($editing_cite_doi_template || preg_match("~[cC]ite[ _](?:doi|pmid|jstor|pmc)~", $page)) {
  echo "\n First: Cite Doi formatting";
  // If we only have the first author, look for more!
  if (!is('coauthors')
    && !is('author2')
    && !is('last2')
    && is('doi')
  ) {
    echo "\n - Looking for co-authors & page numbers...";
    $moreAuthors = findMoreAuthors($p['doi'][0], get_first_author($p), $p['pages'][0]);
    // Tolerate a result without a usable 'authors' list instead of calling count() on it.
    $count_new_authors = (isset($moreAuthors['authors']) && is_array($moreAuthors['authors']))
      ? count($moreAuthors['authors'])
      : 0;
    if ($count_new_authors) {
      echo " Found more authors! ";
      for ($j = 0; $j < $count_new_authors; $j++) {
        // Each entry is expected as "Last, First"; an entry without ", " is kept whole as authorN.
        $au = explode(', ', $moreAuthors['authors'][$j]);
        if (!empty($au[1])) {
          set ('last' . ($j+1), $au[0]);
          // Reduce each given name to an initial.  The "u" modifier makes \p{L}
          // operate on whole UTF-8 characters; without it, multibyte letters are
          // split byte-wise and the result is corrupted.
          $initials = preg_replace("~(\p{L})\p{L}*\.? ?~u", "$1.", $au[1]);
          if ($initials === null) {
            // preg_replace() returns null when the subject is not valid UTF-8;
            // fall back to byte-wise matching rather than storing null.
            $initials = preg_replace("~(\p{L})\p{L}*\.? ?~", "$1.", $au[1]);
          }
          set ('first' . ($j+1), $initials);
          unset($p['author' . ($j+1)]);
        } else {
          if ($au[0]) {
            set ('author' . ($j+1), $au[0]);
          }
        }
      }
      unset($p['author']);
    }
    if (!empty($moreAuthors['pages'])) {
      set('pages', $moreAuthors['pages']);
      echo " Completed page range! (" . $p['pages'][0] . ')';
    }
  }
  // Remove authorN / lastN / firstN parameters that are absent or blank.
  for ($i = 1; $i < 100; $i ++) {
    foreach (array("author", "last", "first") as $param) {
      if (!isset($p[$param . $i][0]) || trim($p[$param . $i][0]) == "") {
        unset ($p[$param . $i]);
      }
    }
  }
  citeDoiOutputFormat();
}
// Check that the URL functions, and mark as dead if not.
/* Disable; to re-enable, we should log possible 404s and check back later.
* Also, dead-link notifications should be placed ''after'', not within, the template.
if (!is("format") && is("url") && !is("accessdate") && !is("archivedate") && !is("archiveurl"))
{
echo "\n - Checking that URL is live...";
$formatSet = isset($p["format"]);
$p["format"][0] = assessUrl($p["url"][0]);
if (!$formatSet && trim($p["format"][0]) == "") {
unset($p["format"]);
}
echo "Done" , is("format")?" ({$p["format"][0]})":"" , ".</p>";
}*/
}
// Neaten capitalisation for journal ($journal holds the name of the parameter in use)
if (isset($p[$journal][0])) {
  $p[$journal][0] = niceTitle($p[$journal][0], false);
}
// If there was a date parameter to start with, don't add a year too. This will be created by the template.
if ($dateToStartWith) {
  unset($p["year"]);
}
// Check each author for embedded author links, e.g. [[Target|Name]] or [[Name]].
for ($au_i = 1; $au_i < 100; $au_i++) {
  // Skip author slots that do not exist, rather than running the regex on an undefined value.
  if (!isset($p["author$au_i"][0])) {
    continue;
  }
  if (preg_match("~\[\[(([^\|]+)\|)?([^\]]+)\]?\]?~", $p["author$au_i"][0], $match)) {
    // Capture 2 is the link target when a pipe is present; capture 3 is the displayed name.
    if_null_set("authorlink$au_i", ucfirst($match[2]?$match[2]:$match[3]));
    set("author$au_i", $match[3]); // Replace author with unlinked version
    echo "Dissecting authorlink";
  }
}
// Check that the DOI functions (slow mode only).
if ($slow_mode && isset($p["doi"][0]) && trim($p["doi"][0]) != "" && trim($p["doi"][0]) != "|") {
  echo "\nChecking that DOI {$p["doi"][0]} is operational...";
  $brokenDoi = isDoiBroken($p["doi"][0], $p, $slow_mode);
  if ($brokenDoi && !is("doi_brokendate") && !is("doi_inactivedate")) {
    // Newly detected broken DOI: stamp it with today's date.
    set("doi_inactivedate", date("Y-m-d"));
    // Echo the DOI that was checked ($doi is only assigned on the URL-scrape path).
    echo "\n\n {$p["doi"][0]} \n\n";
  } elseif (!$brokenDoi) {
    // The DOI works, so both "broken" markers are cleared.  The braces matter:
    // without them the second unset() ran unconditionally and wiped the
    // doi_inactivedate that had just been set for a broken DOI.
    unset($p["doi_brokendate"]);
    unset($p["doi_inactivedate"]);
  }
  echo $brokenDoi?" It isn't.":" It is.", "</p>";
}
// Work around known errors in publishers' databases.
// A journal whose name starts with "BMC " and which has a pages value loses its issue parameter.
if (strpos(trim($p["journal"][0]), "BMC ") === 0 && $p["pages"][0]) {
  echo "\n - dropping issue number (BMC journals only have page numbers)";
  unset ($p["issue"]);
}
// This is a bogus DOI from the PMID example file; never keep it.
if ("10.1267/science.040579197" == $p["doi"][0]) {
  unset ($p["doi"]);
}
tidy_citation();
// Optionally convert this template so that the page uses one citation family throughout:
// either everything becomes {{Citation}}, or every {{Citation}} becomes a specific {{Cite xx}}.
if ($unify_citation_templates) {
  if ($citation_template_dominant) {
    if (preg_match("~[cC]ite[ _]\w+~", $citation[$cit_i+2])) {
      // Switching FROM cite xx TO citation; cite xx has a trailing period by default
      if_null_set("postscript", ".");
      $citation[$cit_i+2] = preg_replace("~[cC]ite[ _]\w+~", "Citation", $citation[$cit_i+2]);
    }
  } else {
    if ($harv_template_present) {
      if_null_set("ref", "harv");
    }
    if (preg_match("~[cC]itation~", $citation[$cit_i+2])) {
      // Switching FROM citation TO cite xx; citation has no trailing period by default
      if_null_set("postscript", "<!-- Bot inserted parameter. Either remove it; or change its value to \".\" for the cite to end in a \".\", as necessary. -->{{inconsistent citations}}");
    }
    // Pick the most specific Cite xx template that the populated parameters suggest.
    if (is('inventor-last') || is('inventor-surname') || is('inventor1-surname')
      || is('inventor1-last') || is ('inventor')) {
      $citeTemplate = "Cite patent";
    }
    elseif (is('journal')) {$citeTemplate = "Cite journal";}
    elseif (is('agency') || is('newspaper') || is('magazine') || is('periodical')) {
      $citeTemplate = "Cite news";
    }
    elseif (is('encyclopedia')) {
      $citeTemplate = "Cite encyclopedia";
    }
    elseif (is('conference') || is('conferenceurl')) {$citeTemplate = "Cite conference";}
    // Straightforward cases now out of the way... now for the trickier ones
    elseif (is('chapter') || is('editor') || is('editor-last') || is('editor1') || is('editor1-last')) {
      $citeTemplate = "Cite book";
    }
    // Each identifier gets its own is() call; a misplaced parenthesis here
    // used to pass the boolean ("oclc" || is("series")) to is() instead.
    elseif (!is('date') && !is('month') && (is('isbn') || is("oclc") || is("series"))) {
      // Books usually catalogued by year; no month expected
      $citeTemplate = "Cite book";
    }
    elseif (is('publisher')) {
      // This should be after we've checked for a journal parameter
      if (preg_match("~\w\.\w\w~", $p['publisher'][0])) {
        // it's a fair bet the publisher is a web address
        $citeTemplate = "Cite web";
      } else {
        $citeTemplate = "Cite document";
      }
    }
    elseif (is('url')) {$citeTemplate = "Cite web";} // fall back to this if URL
    else {$citeTemplate = "Cite document";} // No URL and nothing more specific: use the generic Cite document
    $citation[$cit_i+2] = preg_replace("~[cC]itation~", $citeTemplate, $citation[$cit_i+2]);
  }
}
// Append the result of reassemble_citation() for this citation to $cText.
// NOTE(review): the call is said to also populate $modifications["additions"]; the
// earlier comment named that key twice, so the second key it meant is unknown -
// confirm against reassemble_citation().
$cText .= reassemble_citation($p, $editing_cite_doi_template);
//And we're done!
// Report the elapsed whole seconds since $started_citation_at.
$endtime = time();
$timetaken = $endtime - $started_citation_at;
echo "\n*** Complete. Citation assessed in $timetaken secs.\n\n\n";
// Restore comments we hid earlier
for ($j = 0; $j < $countComments; $j++) {
$cText = str_replace(array(sprintf(comment_placeholder, "c$j"),
str_replace($dotEncode, $dotDecode, sprintf(comment_placeholder, "c$j")),