Querying

minestrone allows searching through HTML via CSS selectors (similar to jQuery or other frontend libraries).

root_element

Gets the root element of the HTML.

from minestrone import HTML
html = HTML("""
<div>
  <span>Dormouse</span>
</div>
""")

assert html.root_element.name == "div"

query

Takes a CSS selector and returns an iterator of Element items.

Query by element name

from minestrone import HTML
html = HTML("""
<h1>The Dormouse's Story</h1>
<p>There was a table...</p>
""")

for h1 in html.query("h1"):
    assert str(h1) == "<h1>The Dormouse's Story</h1>"

Query by id

from minestrone import HTML
html = HTML("""
<ul>
  <li><a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a></li>
  <li><a href="http://example.com/lacie" class="sister" id="lacie">Lacie</a></li>
</ul>
""")

for a in html.query("a#elsie"):
    assert str(a) == '<a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a>'

Query by class

from minestrone import HTML
html = HTML("""
<ul>
  <li><a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a></li>
  <li><a href="http://example.com/lacie" class="sister" id="lacie">Lacie</a></li>
</ul>
""")

sisters = html.query("ul li a.sister")

elsie_link = next(sisters)
assert str(elsie_link) == '<a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a>'

lacie_link = next(sisters)
assert str(lacie_link) == '<a href="http://example.com/lacie" class="sister" id="lacie">Lacie</a>'

query_to_list

Exactly the same as query, except that it returns a list of Element items instead of an iterator. A list is sometimes more convenient to work with than the iterator returned by query, but it can take more time to parse and more memory to store the data if the HTML document is large.

from minestrone import HTML
html = HTML("""
<ul>
  <li><a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a></li>
  <li><a href="http://example.com/lacie" class="sister" id="lacie">Lacie</a></li>
</ul>
""")

assert len(html.query_to_list("a")) == 2
assert str(html.query_to_list("a")[0]) == '<a href="http://example.com/elsie" class="sister" id="elsie">Elsie</a>'
assert html.query_to_list("a") == list(html.query("a"))