Skip to content

Instantly share code, notes, and snippets.

@arthurdarcet
Last active December 20, 2015 03:09
Show Gist options
  • Save arthurdarcet/6061186 to your computer and use it in GitHub Desktop.
Save arthurdarcet/6061186 to your computer and use it in GitHub Desktop.
Weird BeautifulSoup bug
import bs4
def working_test(merge_html, main_html):
    """Append every child of ``merge_html``'s <body> to ``main_html``'s <body>.

    Both arguments are HTML strings parsed with ``bs4.BeautifulSoup``. The
    children are taken from a copy of ``body.contents``, so the loop walks a
    fixed sequence even if the original list changes while nodes are being
    appended to the other document. The merged main document is printed;
    nothing is returned.
    """
    target_soup = bs4.BeautifulSoup(main_html)
    source_soup = bs4.BeautifulSoup(merge_html)
    # Copy the child list first so the iteration is independent of the
    # source tree's own bookkeeping.
    children = source_soup.body.contents[:]
    for node in children:
        target_soup.body.append(node)
    print(target_soup)
def broken_test(merge_html, main_html):
    """Reproduce the skipped-node behaviour when merging two <body> elements.

    Parses both HTML strings, then iterates directly over the live
    ``body.contents`` list of the merge document while appending each
    visited node to the main document's <body>. Every visited node is
    printed (its ``repr`` after a ``--`` separator) before it is appended;
    the resulting main document is printed after a ``----`` line. Nothing
    is returned.

    NOTE(review): the live list is iterated on purpose. Appending a node to
    the other tree appears to remove it from the list being iterated (that
    matches the recorded output), so the node that slides into the freed
    slot is never visited. Do not copy the list here: that is exactly what
    ``working_test`` does, and it would hide the behaviour being shown.
    """
    target_soup = bs4.BeautifulSoup(main_html)
    source_soup = bs4.BeautifulSoup(merge_html)
    for node in source_soup.body.contents:
        # repr() is evaluated before the append, so the node is shown as it
        # was while still attached to the merge document.
        print('--', repr(node), sep='\n')
        target_soup.body.append(node)
    print('----', target_soup, sep='\n')
# Fixture 1: two <div>s back to back on one line, a third <div> on its own
# line, then a trailing bare text node.
merge = """
<body>
<div>Won't be merged anyway</div><div>Will be</div>
<div>Will be merged only if the previous tag is merged</div>
Will be merged only if the previous tag isn't
</body>
"""
# Fixture 2: every element on its own line, so each one is separated from
# the next by a text node.
merge2 = """
<body>
<h1>Won't be merged anyway</h1>
<div>Won't be</div>
<div>Will be merged only if the previous tag is merged</div>
Will be merged only if the previous tag isn't
</body>
"""

# Run the failing merge on each fixture against the same minimal main document.
for banner, fixture in (('--- Merge 1', merge), ('\n--- Merge 2', merge2)):
    print(banner)
    broken_test(fixture, '<body>Main body</body>')
@arthurdarcet
Copy link
Author

Output:

--- Merge 1
--
'\n'
--
<div>Will be</div>
--
<div>Will be merged only if the previous tag is merged</div>
----
<body>Main body
<div>Will be</div><div>Will be merged only if the previous tag is merged</div></body>

--- Merge 2
--
'\n'
--
'\n'
--
'\n'
--
"\n    Will be merged only if the previous tag isn't\n  "
----
<body>Main body



    Will be merged only if the previous tag isn't
  </body>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment