Skip to content

Instantly share code, notes, and snippets.

@mhbeals
Last active April 19, 2018 09:48
Show Gist options
  • Save mhbeals/415df6eb58b434cff7d5b8002577a2ce to your computer and use it in GitHub Desktop.
Save mhbeals/415df6eb58b434cff7d5b8002577a2ce to your computer and use it in GitHub Desktop.
# A script to download all (as of April 2018) newspaper articles from Trove
import urllib
import requests
from lxml import etree
import xml.etree.ElementTree as ET
# Use one of these lists or visit https://gist.github.com/mhbeals/1ad7cd04ca0f8fd74e12f6151664873e for full listing
list_pre_1840 = [3,1046,1047,4,5,50,273,22,23,76,1230,19,24,1282,1235,693,95,272,1236,695,696,1237,37,525,944,6,1233,1239,869,694,945,1240,1242,66,170,1013,1238,40,1142,1232,1241,1329,935,171,96,1137,936,1243,41,20,48,1231,984]
list_1840 = [1330,1030,1331,986,181,21,1037,339,292,1014,1015,867,1336,1027,1012,1022,1036,74,1026,1025,1039,35,110,8,1033,1028,1035,190,821,1040,1034,78,1020,1018,1023,1038,1031,1021,690,178,1234,1011,172,58,1029,1016,285,160,1024,1004,1017,1138,937,1019,1032,941,18,14,94,863,864,987,284,26,54,364,13,1100,938,939,55]
list_1850 = [1244,314,464,67,1139,277,56,1041,283,174,805,107,1339,582,31,809,326,1245,948,213,180,874,346,189,484,262,161,669,985,706,193,685,162,496,959,32,1324,33,163,1246,257,1247,7,287,1053,382,380,164,558,1248,104,1054,65,381,353]
list_1860 = [365,415,927,478,10,697,808,394,57,479,731,278,1190,958,376,276,15,150,92,1056,108,377,940,319,807,263,510,1249,239,360,371,264,27,361,1092,756,103,366,16,83,460,28,399,659,185,406,391,125,126,392,1068,865,383,1062,42,260,373,1258,848,246,354,1093,1250,1259,386,259,835,1340,1260,1263,684,84,72,114,288,487,792,838,153,527,412,839,118,966,311,953,372,1334,1050,1148,223,621,508,232,1261,241]
list_1870 = [127,52,567,954,1264,575,946,698,810,725,91,1266,222,950,803,428,840,1171,49,151,184,656,9,1149,355,258,463,29,282,512,1226,902,1332,1069,1145,1172,186,122,1150,374,413,1224,1057,715,720,891,70,356,955,474,81,951,109,719,43,208,639,1144,154,155,988,195,734,862,1265,77,123,1335,850,398,475,963,1262,1052,59,1173,1065,1128,1167,964,701,631,1296,265,261,137,30,712]
list_1880 = [483,281,990,1146,714,947,368,834,248,367,289,177,416,1072,152,1154,471,855,503,804,405,334,225,209,1141,240,350,728,145,516,275,853,234,1270,338,231,80,742,521,866,965,134,1297,60,217,387,335,218,671,1096,624,991,522,235,1147,993,230,713,337,204,1177,1220,357,101,1298,135,196,501,388,499,655,1206,1042,132,1183,949,836,1074,201,200,1199,198,389,402,657,425,214,426,1178,341,1212,952,1200,215,199,194,252,228,133,1086,192,397,472,819,545,351,511,211,994,247,519,53,956,290,34,925,1048,906,111,105,244,826,64,825,565,650,205]
list_1890 = [191,648,71,824,207,960,716,1077,1051,117,270,1201,1075,806,124,518,1049,128,268,641,700,970,1269,745,885,129,647,847,675,266,1316,403,829,269,212,502,688,747,345,858,203,1070,271,130,842,1209,296,1076,140,431,492,188,643,493,309,699,138,197,730,384,1279,704,983,907,407,1222,99,530,1103,505,139,1338,1198,591,1165,1083,638,922,724,870,342,1203,291,450,488,385,447,1272,663,436,245,1277,846,424,73,528,506,873,845,340,274,817,1273,1202,843,1251,737,175,79,1161,961,116,1204,98,120,721,886,489,877,469,702,490,645,1176,498,459,736,969,430,141,975,1185,136,348,1268,142,1120,486,1091,393,480,362,115,881,636,1153,880,452,448,904,396,1160,1006,121,660,1278,887,976,176]
# Trove newspaper title ids for the 1900s list. Kept on a single line so that no id
# is split across a line break (909 had been broken into "90" / "9").
list_1900s = [68,453,691,748,683,379,434,723,1159,143,596,841,1271,658,202,755,741,1317,437,942,692,980,467,726,981,876,1152,497,1214,526,1252,477,1166,717,89,1067,297,523,593,419,336,494,735,524,466,1299,1349,458,705,1218,93,744,547,1309,179,1253,514,888,1184,553,446,146,680,908,455,1181,427,982,286,1194,1318,1097,1058,414,739,1205,445,113,854,1163,390,1195,992,923,500,1219,823,224,517,729,1043,979,1281,1079,644,973,1192,1081,254,957,443,473,1117,1223,971,837,495,451,898,465,1322,532,689,857,1089,1125,168,173,795,169,1059,859,974,989,1105,1193,97,814,515,816,924,62,86,633,1227,156,860,1158,897,1162,206,903,1064,635,1168,674,378,1066,1156,444,1179,61,1300,977,482,1157,1228,1098,818,743,879,667,395,1088,491,1063,681,1180,504,709,1301,159,646,738,820,1182,481,899,687,812,420,470,872,1225,1143,210,896,1170,358,1197,928,330,238,607,1155,421,423,422,637,1221,708,740,703,1333,75,654,828,1078,815,830,967,529,1118,1127,893,652,352,653,343,456,454,733,1116,564,401,1164,918,1087,746,449,429,1007,534,539,542,312,751,303,776,602,606,242,293,546,322,576,317,583,306,307,794,1325,768,618,227,934,978,754,750,544,763,550,551,774,324,573,574,781,313,598,600,318,789,798,608,229,793,613,302,933,932,298,331,605,300,249,315,541,316,563,321,762,570,752,753,777,587,588,592,601,604,609,320,536,560,766,577,578,581,771,597,299,301,537,548,556,561,568,584,614,616,785,554,555,559,562,569,571,580,773,791,585,611,615,617,783,619,622,759,790,295,535,761,333,589,304,594,328,599,787,538,586,603,780,612,243,797,507,682,782,1186,889,620,323,1129,308,784,626,250,796,765,1084,610,800,329,557,1169,359,661,827,707,758,883,87,468,770,788,549,757,595,566,332,772,769,767,779,649,1130,440,878,1005,438,305,590,811,931,1131,409,485,552,844,1133,786,882,1085,775,131,686,930,579,325,439,513,678,1124,533,813,100,929,432,1126,310,894,1107,764,147,327,441,1123,1302,294,926,1080,540,1071,349,651,861,799,760,1115,572,822,457,749,543,1110,509,778,679,900,46,634,347,892,1229,623,404,1119,158,919,344,1114,1111,255,418,909,995,433,236,1313,1009,25,253,997,1256,267,1008,148,875,1094,106,102,1101,1280,1003,1010,461,968,884,628,1274,531,1284,917,1315,251,676,1106,1257,895,868,943,417,69,435,119,1090,625,166,165,1319,913,11,1207,375,167,851,36,370,901,1113,1275,1314,632,664,1060,1151,1276,256,90,220,911,672,1208,914,410,1320,1312,1134,187,718,915,916,831,630,1102,912,1073,45,833,1122,44,920,279,1310,832,710,1196,962,662,112,1321,12,668,1121,1189,722,1213,1174,673,1311,801,852,183,520,216,51,999,1188,1305,1285,910,237,1002,727,629,149,921,972,369,1099,1109,627,1061,998,47,711,1000,1001,996,411,144,400,476,1112,1283,1095,182,1303,363,732,665,63,1055,221,1108,408,38,802,1307,157,233,280,1210,670,442,666,871,890,462,1044,1082,1304,1211,1341,1323,1187,39,1306,1191,1175,1346,1288,1294,1286,1289,1308,1104,1290,1292,1326,1295,1132,1327,1328,856,1343,1291,1045,1344,1345,905]
# Enter your Trove API key here; it is used for every request made below.
apikey = ""

# Base endpoint shared by the count request and the page requests.
API_URL = "http://api.trove.nla.gov.au/result"
# Number of records requested per page (the "n" query parameter).
PAGE_SIZE = 100
# Seconds to wait on the API before giving up, so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 60


def _fetch_ascii(url):
    """Download *url* and return its body with every non-ASCII character dropped.

    Raises a requests.RequestException subclass on a connection failure, a
    timeout, or a 4xx/5xx status, so that an error page is never mistaken
    for article data.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text.encode('ascii', 'ignore').decode('ascii')


def _count_articles(title_id):
    """Return the article total the API reports for *title_id*.

    Requests a single record and reads the ``total`` attribute of the
    ``records`` element. Returns 0 when the reply has no such element.
    """
    url = (API_URL + "?&q=+&zone=newspaper&l-title=" + str(title_id)
           + "&s=1&n=1&sortby=dateasc&key=" + apikey)
    # Parsed straight from memory; no scratch temp.xml is written any more.
    root = ET.fromstring(_fetch_ascii(url))
    total = 0
    for records in root.iter('records'):
        total = int(records.attrib['total'])
    return total


# Goes through every title in that decade (as of April 2018) and gets max number count.
# Change list to correct decade or custom list.
for item in list_pre_1840:
    try:
        max_articles = _count_articles(item)
    except requests.RequestException as err:
        print("Title " + str(item) + " skipped: " + str(err) + "\n")
        continue

    # Goes through each page of that title and collects XML.
    for number in range(0, max_articles, PAGE_SIZE):
        urltext = (API_URL + "?&q=+&zone=newspaper&include=articletext&l-title=" + str(item)
                   + "&s=" + str(number) + "&n=" + str(PAGE_SIZE)
                   + "&sortby=dateasc&key=" + apikey)
        try:
            data = _fetch_ascii(urltext)
        except requests.RequestException as err:
            # Best effort: report the failed page and carry on with the next one.
            print("Title " + str(item) + " offset " + str(number) + " failed: " + str(err) + "\n")
            continue
        # The title id is part of the file name so that one title's pages do not
        # overwrite another's (every title restarts at offset 0).
        with open("{}_{}.xml".format(item, number), 'w') as f:
            f.write(data)
        collected_to = min(number + PAGE_SIZE, max_articles)
        print(str(number) + "-" + str(collected_to) + " out of " + str(max_articles) + " collected\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment