Python Script for archive.org
Posted: Wed Feb 28, 2024 4:00 pm
Hey,
Thought I would share. I wrote this script a while ago to save all the stories from an archive.org snapshot.
I didn't spend a lot of time making it clean/less hacky, but if you want to save all of an archive.org snapshot to a file it will do it for you.
It takes a while and you can add a url to the last_url variable if you have to run it several times.
Just change the p variable to the output file path.
import requests
from bs4 import BeautifulSoup
# Separator inserted between stories in the delimited copy written at the end.
dl = '-----------------------------------------------'
# Output file path -- change this before running.
p = r"C:\...PUT YOUR PATH HERE\out.txt"
# Resume point: set to the exact href of a story link to skip every link
# before it on a re-run; None captures from the very first link.
last_url = None
capture = False
st = []  # text of every page fetched successfully, in fetch order

# The context manager guarantees the file is flushed and closed; the original
# `f.close` (no parentheses) only referenced the method and never called it.
with open(p, 'w', encoding="utf-8") as f:
    # Can be whatever snapshot url you want.  The literal below contains " ... "
    # (it looks truncated), so replace it with a complete snapshot URL.
    r = requests.get(r"https://web.archive.org/web/20110107120 ... al=stories")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    s = BeautifulSoup(r.text, "html.parser")

    # Collect the first child of every odd/even table cell (presumably the
    # link to a listing page -- confirm against the snapshot's markup).
    a = []
    for cell in s.find_all('td', {'class': ['cellodd', 'celleven']}):
        if cell.contents:  # an empty cell has no first child to collect
            a.append(cell.contents[0])

    for entry in a:
        # A first child that is plain text, or a tag without an href, cannot
        # be followed; skip it instead of crashing the whole run.
        c = getattr(entry, 'attrs', {}).get('href')
        if c is None:
            continue
        r2 = requests.get(c)
        s2 = BeautifulSoup(r2.text, "html.parser")
        for e in s2.find_all('a', href=True):
            # Once switched on, capture stays on for every later link.
            if last_url is None or e.attrs['href'] == last_url:
                capture = True
            if not capture:
                continue
            try:
                rt = requests.get(e.attrs['href'])
                t = BeautifulSoup(rt.text, "html.parser").text
            except Exception:
                # Best effort: links are requested exactly as written in the
                # page, so any that cannot be fetched or parsed are skipped.
                # Unlike a bare `except:`, this lets Ctrl-C stop the script.
                continue
            st.append(t)
            # Written immediately so a partial run still leaves output behind.
            f.write(t)

# NOTE(review): every story was already written above as it was fetched, so
# this appends a second, delimiter-separated copy of the same text.  Kept to
# preserve the original output format -- confirm whether both are wanted.
with open(p, 'a', encoding="utf-8") as f:
    f.write(dl.join(st))
print('d')
Thought I would share. I wrote this script a while ago to save all the stories from an archive.org snapshot.
I didn't spend a lot of time making it clean/less hacky, but if you want to save all of an archive.org snapshot to a file it will do it for you.
It takes a while and you can add a url to the last_url variable if you have to run it several times.
Just change the p variable to the output file path.
import requests
from bs4 import BeautifulSoup
# Separator inserted between stories in the delimited copy written at the end.
dl = '-----------------------------------------------'
# Output file path -- change this before running.
p = r"C:\...PUT YOUR PATH HERE\out.txt"
# Resume point: set to the exact href of a story link to skip every link
# before it on a re-run; None captures from the very first link.
last_url = None
capture = False
st = []  # text of every page fetched successfully, in fetch order

# The context manager guarantees the file is flushed and closed; the original
# `f.close` (no parentheses) only referenced the method and never called it.
with open(p, 'w', encoding="utf-8") as f:
    # Can be whatever snapshot url you want.  The literal below contains " ... "
    # (it looks truncated), so replace it with a complete snapshot URL.
    r = requests.get(r"https://web.archive.org/web/20110107120 ... al=stories")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    s = BeautifulSoup(r.text, "html.parser")

    # Collect the first child of every odd/even table cell (presumably the
    # link to a listing page -- confirm against the snapshot's markup).
    a = []
    for cell in s.find_all('td', {'class': ['cellodd', 'celleven']}):
        if cell.contents:  # an empty cell has no first child to collect
            a.append(cell.contents[0])

    for entry in a:
        # A first child that is plain text, or a tag without an href, cannot
        # be followed; skip it instead of crashing the whole run.
        c = getattr(entry, 'attrs', {}).get('href')
        if c is None:
            continue
        r2 = requests.get(c)
        s2 = BeautifulSoup(r2.text, "html.parser")
        for e in s2.find_all('a', href=True):
            # Once switched on, capture stays on for every later link.
            if last_url is None or e.attrs['href'] == last_url:
                capture = True
            if not capture:
                continue
            try:
                rt = requests.get(e.attrs['href'])
                t = BeautifulSoup(rt.text, "html.parser").text
            except Exception:
                # Best effort: links are requested exactly as written in the
                # page, so any that cannot be fetched or parsed are skipped.
                # Unlike a bare `except:`, this lets Ctrl-C stop the script.
                continue
            st.append(t)
            # Written immediately so a partial run still leaves output behind.
            f.write(t)

# NOTE(review): every story was already written above as it was fetched, so
# this appends a second, delimiter-separated copy of the same text.  Kept to
# preserve the original output format -- confirm whether both are wanted.
with open(p, 'a', encoding="utf-8") as f:
    f.write(dl.join(st))
print('d')