import asyncio
import logging

from pyppeteer import launch

logging.basicConfig(level=logging.INFO)
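
# NOTE: pyppeteer is a third-party package (pip install pyppeteer); on first
# launch it downloads a bundled Chromium build if one is not already cached,
# which can take a while.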


async def crawl(url):
    try:
        # Launch a new Chromium browser with a visible window
        logging.info('Launching browser')
        browser = await launch(headless=False)

        # Open a new page (tab)
        page = await browser.newPage()
        logging.info('Browser opened')

        try:
            # Navigate to the specified URL
            await page.goto(url)
            logging.info(f"Accessed {url}")
        except Exception as e:
            logging.error(f"Failed to navigate to {url}: {e}")
            await browser.close()
            return

        try:
            # Wait until the page body is present
            await page.waitForSelector('body')
        except Exception as e:
            logging.error(f"Failed to load the page properly: {e}")
            await browser.close()
            return
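
        # Note: page.goto already waits for the page's load event by
        # default, so the waitForSelector check above is a defensive
        # extra step rather than a strict requirement.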

        try:
            # Extract the full HTML content of the page
            content = await page.content()
            logging.info(f"Retrieved {len(content)} characters of HTML")

            # (Optional) Extract and print all links as an example
            links = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('a')).map(link => ({
                    text: link.innerText,
                    url: link.href
                }));
            }''')
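            # page.evaluate runs the function above inside the page's own
            # JavaScript context; the return value must be JSON-serializable,
            # which is why plain text/href strings are returned here rather
            # than element handles.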

            for link in links:
                print(f"Link text: {link['text']}, URL: {link['url']}")

        except Exception as e:
            logging.error(f"Error extracting or processing the content: {e}")

        finally:
            # Ensure the browser closes after execution
            await browser.close()

    except Exception as e:
        logging.critical(f"Critical error occurred: {e}")


if __name__ == '__main__':
    # Specify the URL of the web page you want to crawl
    url = 'https://www.google.com/'

    # Run the crawl function
    asyncio.run(crawl(url))
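
# A hypothetical extension (not part of the original script): to crawl
# several URLs in sequence, wrap crawl() in another coroutine, e.g.
#
#     async def crawl_many(urls):
#         for u in urls:
#             await crawl(u)
#
#     asyncio.run(crawl_many(['https://example.com', 'https://example.org']))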