a somewhat functioning crawler - don't use yet!

Thomas Faour 2025-03-30 22:18:36 -04:00
commit a17b1a6eea
10 changed files with 30313 additions and 0 deletions

5
app.py Normal file

@@ -0,0 +1,5 @@
from crawler import Crawler
c = Crawler("https://wikipedia.com", "test_db.db")
c.start()
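Once a run finishes (or is interrupted), the visit counts can be read straight out of the SQLite file that app.py points at. A minimal inspection sketch, assuming the my_table schema defined in crawler/crawler.py below:

import sqlite3

conn = sqlite3.connect("test_db.db")
# Ten most frequently visited URLs recorded so far.
for url, visits in conn.execute(
        "SELECT url, times_visited FROM my_table ORDER BY times_visited DESC LIMIT 10"):
    print(f"{visits:4d}  {url}")
conn.close()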

1
crawler/__init__.py Normal file

@@ -0,0 +1 @@
from .crawler import Crawler

Binary file not shown.

Binary file not shown.

129
crawler/crawler.py Normal file

@@ -0,0 +1,129 @@
import requests
import sqlite3
from bs4 import BeautifulSoup
from pathlib import Path

# Schema for the visit log: one row per URL with a visit counter.
db_schema = """CREATE TABLE IF NOT EXISTS my_table (
    ID INTEGER PRIMARY KEY,
    url TEXT NOT NULL UNIQUE,
    times_visited INTEGER DEFAULT 0
);
"""


class CrawlerDB:
    def __init__(self, database):
        # Connect to the SQLite file, creating it and the table on first use.
        db_path = Path(database)
        is_new = not db_path.exists()
        self.conn = sqlite3.connect(db_path)
        if is_new:
            cur = self.conn.cursor()
            cur.execute(db_schema)
            self.conn.commit()

    def add_url(self, url):
        if not self.conn:
            raise ValueError("Must init database first")
        cursor = self.conn.cursor()
        try:
            # Try to insert the URL with times_visited = 1
            cursor.execute('''
                INSERT INTO my_table (url, times_visited)
                VALUES (?, 1)
            ''', (url,))
        except sqlite3.IntegrityError:
            # If the URL already exists, update times_visited
            cursor.execute('''
                UPDATE my_table
                SET times_visited = times_visited + 1
                WHERE url = ?
            ''', (url,))
        self.conn.commit()

    def get_times_visited(self, url):
        # Query the database for the times_visited value
        cursor = self.conn.cursor()
        cursor.execute('''
            SELECT times_visited FROM my_table WHERE url = ?
        ''', (url,))
        result = cursor.fetchone()
        # Return the value if found, otherwise return 0
        return result[0] if result else 0


# Request failures that are caught so a single bad link cannot kill the crawl.
REQUESTS_ERRORS = (
    requests.exceptions.MissingSchema,
    requests.exceptions.InvalidSchema,
    requests.exceptions.InvalidURL,
    requests.exceptions.ConnectionError,
    requests.exceptions.ReadTimeout,
)


class Crawler:
    def __init__(self, starting_url, database):
        self.url_list = set()
        self.starting_url = starting_url
        self.db = CrawlerDB(database)
        self.list_size_history = [1]
        self.url_list.add(starting_url)

    def _crawl(self):
        while True:
            if len(self.list_size_history) >= 100:
                # Every 100 samples, append the history to the size_history2 log and purge it.
                file_path = Path("size_history2")
                with file_path.open(mode="a", encoding="utf-8") as file:
                    for item in self.list_size_history:
                        file.write(f"{item}\n")
                self.list_size_history.clear()
            try:
                self._add_url(self.url_list.pop())
            except KeyError:
                # The frontier set is empty; nothing left to crawl.
                break

    def start(self):
        self._crawl()

    def _add_url(self, url):
        # Only visit unseen URLs (or the start URL) and cap the frontier at 30,000 entries.
        if (self.db.get_times_visited(url) == 0 or url == self.starting_url) and len(self.url_list) < 30000:
            for u in self._parse_url(url):
                self.url_list.add(u)
            self.db.add_url(url)

    def _parse_url(self, url):
        """
        Returns list of links from url
        """
        req = None
        size = len(self.url_list)
        print(f"Currently parsing: {url} |#| {size} URLs in my list.")
        self.list_size_history.append(size)
        if "https://https://" in url:
            # Something has gone horribly wrong with scheme fix-ups; give up on this URL.
            return []
        try:
            req = requests.get(url, timeout=0.5)
        except REQUESTS_ERRORS:
            if url.startswith("https://"):
                # A full URL that still failed; drop it.
                return []
            if url.startswith("//"):
                # Protocol-relative link: retry with an explicit scheme.
                return self._parse_url("https:" + url)
            # Relative or schemeless link: retry with https:// prepended.
            return self._parse_url("https://" + url)
        if req is None:
            return []
        soup = BeautifulSoup(req.text, 'html.parser')
        urls = [a['href'] for a in soup.find_all('a', href=True)]
        return urls
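The insert-or-increment behaviour of CrawlerDB can be exercised on its own, without starting a crawl. A minimal sketch, assuming the package is importable as crawler and using a throwaway scratch.db file (the filename is only illustrative):

from crawler.crawler import CrawlerDB

db = CrawlerDB("scratch.db")
db.add_url("https://example.org")   # first insert: times_visited = 1
db.add_url("https://example.org")   # duplicate: IntegrityError path bumps the counter
print(db.get_times_visited("https://example.org"))         # 2
print(db.get_times_visited("https://never-seen.example"))  # 0 for unknown URLs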

878
index.html Normal file

File diff suppressed because one or more lines are too long

2200
oldsizehist2 Normal file

File diff suppressed because it is too large

2800
size_history Normal file

File diff suppressed because it is too large

24300
size_history2 Normal file

File diff suppressed because it is too large
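size_history2 is written by Crawler._crawl as one frontier-size sample per line, so it can be summarised with a few lines of Python. A minimal sketch:

from pathlib import Path

sizes = [
    int(line)
    for line in Path("size_history2").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
print(f"{len(sizes)} samples, frontier peaked at {max(sizes)} URLs")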

BIN
test_db.db Normal file

Binary file not shown.