a somewhat functioning crawler - don't use yet!
commit a17b1a6eea
app.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from crawler import Crawler

c = Crawler("https://wikipedia.com", "test_db.db")

c.start()
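A minimal variation, sketched here rather than taken from the commit: putting the same two calls behind a __main__ guard keeps importing app.py from immediately starting a crawl. It assumes only the Crawler API shown above.

from crawler import Crawler

if __name__ == "__main__":
    c = Crawler("https://wikipedia.com", "test_db.db")
    c.start()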
crawler/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .crawler import Crawler
crawler/__pycache__/__init__.cpython-312.pyc (new file, binary, not shown)
crawler/__pycache__/crawler.cpython-312.pyc (new file, binary, not shown)
crawler/crawler.py (new file, 129 lines)
@@ -0,0 +1,129 @@
import requests
import sqlite3
from bs4 import BeautifulSoup
from pathlib import Path

db_schema = """CREATE TABLE IF NOT EXISTS my_table (
    ID INTEGER PRIMARY KEY,
    url TEXT NOT NULL UNIQUE,
    times_visited INTEGER DEFAULT 0
);
"""

class CrawlerDB:
    """Thin wrapper around the SQLite database that tracks visited URLs."""

    def __init__(self, database):
        self.conn = None
        if not Path(database).exists():
            # First run: connecting creates the file, then apply the schema.
            self.conn = sqlite3.connect(Path(database))
            cur = self.conn.cursor()
            cur.execute(db_schema)
            self.conn.commit()
        else:
            self.conn = sqlite3.connect(database)

    def add_url(self, url):
        if not self.conn:
            raise ValueError("Must init database first")
        cursor = self.conn.cursor()
        try:
            # Try to insert the URL with times_visited = 1
            cursor.execute('''
                INSERT INTO my_table (url, times_visited)
                VALUES (?, 1)
            ''', (url,))
        except sqlite3.IntegrityError:
            # If the URL already exists, update times_visited
            cursor.execute('''
                UPDATE my_table
                SET times_visited = times_visited + 1
                WHERE url = ?
            ''', (url,))
        self.conn.commit()

    def get_times_visited(self, url):
        # Query the database for the times_visited value
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT times_visited FROM my_table WHERE url = ?
        ''', (url,))
        result = cursor.fetchone()

        # Return the value if found, otherwise return 0
        if result:
            return result[0]
        else:
            return 0


# Exceptions from requests that _parse_url catches instead of crashing on.
REQUESTS_ERRORS = (
    requests.exceptions.MissingSchema,
    requests.exceptions.InvalidSchema,
    requests.exceptions.InvalidURL,
    requests.exceptions.ConnectionError,
    requests.exceptions.ReadTimeout,
)

class Crawler:

    def __init__(self, starting_url, database):
        self.url_list = set()
        self.starting_url = starting_url
        self.db = CrawlerDB(database)
        # Running record of the frontier size; _crawl periodically flushes it to size_history2.
        self.list_size_history = [1]

        self.url_list.add(starting_url)

    def _crawl(self):
        while True:
            if len(self.list_size_history) % 100 == 0:
                # every 100 entries, append the size history to a file and purge it
                file_path = Path("size_history2")
                with file_path.open(mode="a", encoding="utf-8") as file:
                    for item in self.list_size_history:
                        file.write(f"{item}\n")
                self.list_size_history.clear()

            try:
                self._add_url(self.url_list.pop())
            except KeyError:
                # the frontier is empty, nothing left to crawl
                break

    def start(self):
        self._crawl()

    def _add_url(self, url):
        # Only expand URLs we have not visited yet (the starting URL is always
        # expanded), and stop growing the frontier past 30000 entries.
        if (self.db.get_times_visited(url) == 0 or url == self.starting_url) and len(self.url_list) < 30000:
            for u in self._parse_url(url):
                self.url_list.add(u)
            self.db.add_url(url)


    def _parse_url(self, url):
        """
        Returns list of links from url
        """
        req = None
        list_size = len(self.url_list)
        print(f"Currently parsing: {url} |#| {list_size} URLs in my list.")
        self.list_size_history.append(list_size)
        if "https://https://" in url:
            # something has gone horribly wrong
            return []
        try:
            req = requests.get(url, timeout=0.5)
        except REQUESTS_ERRORS:
            if url.startswith("https://"):
                return []
            # the URL probably lacks a scheme; retry with one prepended
            if url.startswith("//"):
                return self._parse_url("https:" + url)
            return self._parse_url("https://" + url)
        if req is None:
            return []
        soup = BeautifulSoup(req.text, 'html.parser')
        urls = [a['href'] for a in soup.find_all('a', href=True)]
        return urls
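One possible simplification of CrawlerDB.add_url, sketched here rather than taken from the commit: SQLite's ON CONFLICT clause (available since SQLite 3.24) performs the insert-or-increment in a single statement, replacing the try/except on IntegrityError. It assumes the same my_table schema with its UNIQUE constraint on url.

    def add_url(self, url):
        if not self.conn:
            raise ValueError("Must init database first")
        cursor = self.conn.cursor()
        cursor.execute('''
            INSERT INTO my_table (url, times_visited)
            VALUES (?, 1)
            ON CONFLICT(url) DO UPDATE SET times_visited = times_visited + 1
        ''', (url,))
        self.conn.commit()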
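The scheme-guessing retries in _parse_url could also be avoided earlier, again only a sketch rather than part of the commit: urllib.parse.urljoin from the standard library resolves relative and protocol-relative hrefs against the page that was just fetched, so only absolute URLs ever enter the frontier. The helper name resolve_links is hypothetical.

from urllib.parse import urljoin

def resolve_links(page_url, soup):
    # urljoin handles absolute, relative ("/wiki/Foo"), and protocol-relative ("//example.org") hrefs
    return [urljoin(page_url, a['href']) for a in soup.find_all('a', href=True)]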
index.html (new file, 878 lines; diff suppressed because one or more lines are too long)
oldsizehist2 (new file, 2200 lines; diff suppressed because it is too large)
size_history (new file, 2800 lines; diff suppressed because it is too large)
size_history2 (new file, 24300 lines; diff suppressed because it is too large)
test_db.db (new file, binary, not shown)