first commit
This commit is contained in:
63
webcrawler studieprogramma's/login.py
Normal file
63
webcrawler studieprogramma's/login.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import asyncio
|
||||
from pyppeteer import launch
|
||||
import logging
|
||||
|
||||
# Configure the root logger so crawl()'s info/error/critical messages are shown.
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
async def crawl(url):
    """Open *url* in a visible Chromium window and print every link on the page.

    Parameters
    ----------
    url : str
        Address of the page to visit.

    Side effects: launches a headful Chromium via pyppeteer, logs progress
    through the `logging` module, and prints each anchor's text and href.
    The browser is always closed exactly once before returning, even when
    navigation or extraction fails (the original leaked the browser if
    ``newPage()`` raised).
    """
    try:
        # Launch a new Chromium browser with a visible window
        print('browser launching')
        browser = await launch(headless=False)
    except Exception as e:
        # Launch itself failed: nothing to clean up yet.
        logging.critical(f"Critical error occurred: {e}")
        return

    try:
        # Open a new page
        page = await browser.newPage()
        print('browser opened')

        try:
            # Navigate to the specified URL
            await page.goto(url)
            logging.info(f"Accessed {url}")
        except Exception as e:
            logging.error(f"Failed to navigate to {url}: {e}")
            return

        try:
            # Wait for the page to fully load
            await page.waitForSelector('body')
        except Exception as e:
            logging.error(f"Failed to load the page properly: {e}")
            return

        try:
            # Extract the content of the page (fetched to verify the page
            # renders; the HTML itself is not used further here)
            await page.content()
            # (Optional) Extract and print all links as an example
            links = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('a')).map(link => ({
                    text: link.innerText,
                    url: link.href
                }));
            }''')

            for link in links:
                print(f"Link text: {link['text']}, URL: {link['url']}")

        except Exception as e:
            logging.error(f"Error extracting or processing the content: {e}")

    except Exception as e:
        logging.critical(f"Critical error occurred: {e}")
    finally:
        # Single cleanup path: the browser closes exactly once on every
        # exit route (success, early return, or unexpected exception).
        await browser.close()
|
||||
|
||||
|
||||
# Specify the URL of the web page you want to crawl
url = 'https://www.google.com/'

# Run the crawl function. asyncio.run() creates, runs, and closes a fresh
# event loop; get_event_loop().run_until_complete() is deprecated for this
# use since Python 3.10.
asyncio.run(crawl(url))
|
||||
0
webcrawler studieprogramma's/main.py
Normal file
0
webcrawler studieprogramma's/main.py
Normal file
Reference in New Issue
Block a user