# ows-master/webcrawler studieprogramma's/login.py

import asyncio
from pyppeteer import launch
import logging
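# Show INFO-level (and higher) messages on the console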
logging.basicConfig(level=logging.INFO)


async def crawl(url):
    try:
        # Launch a new Chromium browser with a visible window
        logging.info("Launching browser")
        browser = await launch(headless=False)
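        # NOTE: on its first run pyppeteer downloads a local Chromium build;
        # passing executablePath to launch() reuses an existing browser instead.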
        # Open a new page (tab)
        page = await browser.newPage()
        logging.info("Browser opened")
        try:
            # Navigate to the specified URL
            await page.goto(url)
            logging.info(f"Accessed {url}")
        except Exception as e:
            logging.error(f"Failed to navigate to {url}: {e}")
            await browser.close()
            return
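        # NOTE: page.goto() also accepts a waitUntil option (e.g.
        # 'networkidle2') for a stricter load condition than the default
        # 'load' event.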
        try:
            # Wait until the <body> element is present in the DOM
            # (this does not guarantee the page has fully finished loading)
            await page.waitForSelector('body')
        except Exception as e:
            logging.error(f"Failed to load the page properly: {e}")
            await browser.close()
            return
        try:
            # Extract the full HTML of the page (retrieved but not used further here)
            content = await page.content()
            # (Optional) Extract and print all links as an example
            links = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('a')).map(link => ({
                    text: link.innerText,
                    url: link.href
                }));
            }''')
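            # NOTE: evaluate() can only return JSON-serializable data, hence
            # the mapping to plain {text, url} objects inside the page context.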
            for link in links:
                print(f"Link text: {link['text']}, URL: {link['url']}")
        except Exception as e:
            logging.error(f"Error extracting or processing the content: {e}")
        finally:
            # Ensure the browser closes after execution
            await browser.close()
    except Exception as e:
        logging.critical(f"Critical error occurred: {e}")


# Specify the URL of the web page you want to crawl
url = 'https://www.google.com/'

if __name__ == '__main__':
    # Run the crawl function; asyncio.run() replaces the deprecated
    # asyncio.get_event_loop().run_until_complete() pattern
    asyncio.run(crawl(url))
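
# A minimal sketch (hypothetical URLs) of reusing crawl() for several pages;
# each call launches and closes a fresh browser:
#
#     for u in ['https://example.com/', 'https://example.org/']:
#         asyncio.run(crawl(u))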