a somewhat functioning crawler - don't use yet!

Thomas Faour 2025-03-30 22:18:36 -04:00
commit a17b1a6eea
10 changed files with 30313 additions and 0 deletions

5
app.py Normal file

@@ -0,0 +1,5 @@
from crawler import Crawler
c = Crawler("https://wikipedia.com", "test_db.db")
c.start()
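Once a run finishes (or is interrupted), the visit counts can be read straight out of the SQLite file that app.py points at. A minimal inspection sketch, assuming the my_table schema defined in crawler/crawler.py below:

import sqlite3

conn = sqlite3.connect("test_db.db")
# Ten most frequently visited URLs recorded so far.
for url, visits in conn.execute(
        "SELECT url, times_visited FROM my_table ORDER BY times_visited DESC LIMIT 10"):
    print(f"{visits:4d}  {url}")
conn.close()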

1
crawler/__init__.py Normal file

@@ -0,0 +1 @@
from .crawler import Crawler

Binary file not shown.

Binary file not shown.

129
crawler/crawler.py Normal file

@@ -0,0 +1,129 @@
import requests
import sqlite3
from bs4 import BeautifulSoup
from pathlib import Path

# Schema for the visit log: one row per URL with a visit counter.
db_schema = """CREATE TABLE IF NOT EXISTS my_table (
    ID INTEGER PRIMARY KEY,
    url TEXT NOT NULL UNIQUE,
    times_visited INTEGER DEFAULT 0
);
"""


class CrawlerDB:
    def __init__(self, database):
        # Connect to the SQLite file, creating it and the table on first use.
        db_path = Path(database)
        is_new = not db_path.exists()
        self.conn = sqlite3.connect(db_path)
        if is_new:
            cur = self.conn.cursor()
            cur.execute(db_schema)
            self.conn.commit()

    def add_url(self, url):
        if not self.conn:
            raise ValueError("Must init database first")
        cursor = self.conn.cursor()
        try:
            # Try to insert the URL with times_visited = 1
            cursor.execute('''
                INSERT INTO my_table (url, times_visited)
                VALUES (?, 1)
            ''', (url,))
        except sqlite3.IntegrityError:
            # If the URL already exists, update times_visited
            cursor.execute('''
                UPDATE my_table
                SET times_visited = times_visited + 1
                WHERE url = ?
            ''', (url,))
        self.conn.commit()

    def get_times_visited(self, url):
        # Query the database for the times_visited value
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT times_visited FROM my_table WHERE url = ?
        ''', (url,))
        result = cursor.fetchone()
        # Return the value if found, otherwise return 0
        return result[0] if result else 0


# Request failures that are caught so a single bad link cannot kill the crawl.
REQUESTS_ERRORS = (
    requests.exceptions.MissingSchema,
    requests.exceptions.InvalidSchema,
    requests.exceptions.InvalidURL,
    requests.exceptions.ConnectionError,
    requests.exceptions.ReadTimeout,
)


class Crawler:
    def __init__(self, starting_url, database):
        self.url_list = set()
        self.starting_url = starting_url
        self.db = CrawlerDB(database)
        self.list_size_history = [1]
        self.url_list.add(starting_url)

    def _crawl(self):
        while True:
            if len(self.list_size_history) >= 100:
                # Every 100 samples, append the history to the size_history2 log and purge it.
                file_path = Path("size_history2")
                with file_path.open(mode="a", encoding="utf-8") as file:
                    for item in self.list_size_history:
                        file.write(f"{item}\n")
                self.list_size_history.clear()
            try:
                self._add_url(self.url_list.pop())
            except KeyError:
                # The frontier set is empty; nothing left to crawl.
                break

    def start(self):
        self._crawl()

    def _add_url(self, url):
        # Only visit unseen URLs (or the start URL) and cap the frontier at 30,000 entries.
        if (self.db.get_times_visited(url) == 0 or url == self.starting_url) and len(self.url_list) < 30000:
            for u in self._parse_url(url):
                self.url_list.add(u)
            self.db.add_url(url)

    def _parse_url(self, url):
        """
        Returns list of links from url
        """
        req = None
        size = len(self.url_list)
        print(f"Currently parsing: {url} |#| {size} URLs in my list.")
        self.list_size_history.append(size)
        if "https://https://" in url:
            # Something has gone horribly wrong with scheme fix-ups; give up on this URL.
            return []
        try:
            req = requests.get(url, timeout=0.5)
        except REQUESTS_ERRORS:
            if url.startswith("https://"):
                # A full URL that still failed; drop it.
                return []
            if url.startswith("//"):
                # Protocol-relative link: retry with an explicit scheme.
                return self._parse_url("https:" + url)
            # Relative or schemeless link: retry with https:// prepended.
            return self._parse_url("https://" + url)
        if req is None:
            return []
        soup = BeautifulSoup(req.text, 'html.parser')
        urls = [a['href'] for a in soup.find_all('a', href=True)]
        return urls
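The insert-or-increment behaviour of CrawlerDB can be exercised on its own, without starting a crawl. A minimal sketch, assuming the package is importable as crawler and using a throwaway scratch.db file (the filename is only illustrative):

from crawler.crawler import CrawlerDB

db = CrawlerDB("scratch.db")
db.add_url("https://example.org")   # first insert: times_visited = 1
db.add_url("https://example.org")   # duplicate: IntegrityError path bumps the counter
print(db.get_times_visited("https://example.org"))         # 2
print(db.get_times_visited("https://never-seen.example"))  # 0 for unknown URLs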

878
index.html Normal file

File diff suppressed because one or more lines are too long

2200
oldsizehist2 Normal file

File diff suppressed because it is too large

2800
size_history Normal file

File diff suppressed because it is too large

24300
size_history2 Normal file

File diff suppressed because it is too large
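size_history2 is written by Crawler._crawl as one frontier-size sample per line, so it can be summarised with a few lines of Python. A minimal sketch:

from pathlib import Path

sizes = [
    int(line)
    for line in Path("size_history2").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
print(f"{len(sizes)} samples, frontier peaked at {max(sizes)} URLs")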

BIN
test_db.db Normal file

Binary file not shown.