Similar to
QDir and QDirIterator ignore files with non-ASCII filenames
and
UnicodeEncodeError: 'latin-1' codec can't encode character
With regard to the second link above, I added test0() below. My understanding was that utf-8 was the solution I was searching for, but alas trying to encode the filename fails.
def test0():
    """Run the existence checks on a name containing a lone surrogate.

    U+DCB4 is how Python 3 represents the undecodable byte 0xB4 of a file
    name (PEP 383, the "surrogateescape" error handler).  A strict UTF-8
    encode rejects lone surrogates with UnicodeEncodeError, so the name is
    converted back to bytes with os.fsencode() before it is handed to
    QFile.decodeName().
    """
    print("test0...using unicode literal")
    name = u"123c\udcb4.wav"
    test("test0b", name)
    # os.fsencode() encodes with the filesystem encoding; on POSIX it uses
    # the surrogateescape handler, so the escaped surrogate becomes the
    # original raw byte again instead of raising as
    # name.encode('utf-8') does.
    n = os.fsencode(name)
    print(n)
    n = QtCore.QFile.decodeName(n)
    print(n)
    # From http://docs.python.org/release/3.0.1/howto/unicode.html
    # This will indeed overwrite the correct file!
    # f = open(name, 'w')
    # f.write('blah\n')
    # f.close()
Test0 results...
test0...using unicode literal
test0b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test0b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test0b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test0b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
Traceback (most recent call last):
File "unicode.py", line 157, in <module>
test0()
File "unicode.py", line 42, in test0
n = name.encode('utf-8')
UnicodeEncodeError: 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
EDIT
Further reading of https://www.rfc-editor.org/rfc/rfc3629 tells me that "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF". So if UTF-8 doesn't allow these characters, how are you supposed to deal with a file that is so named? Python can create such files and test for their existence, so this points me at an issue with my usage of the Qt API, or with the Qt API itself.
I am struggling to wrap my head around the proper handling of Unicode file names in Python 3. Ultimately, I'm working on a Phonon-based music player. I've tried to isolate the problem(s) from that as much as possible. From the code below you will see that I've tried as many alternatives as I could find. My initial impression is that there are bugs here... maybe mine, maybe in one or more libraries. Any help would be much appreciated!
I have a directory with 3 unicode file names 123[abc]U.wav. The first 2 files are handled properly...mostly...the third one 123c is just wrong.
from PyQt4 import QtGui, QtCore
import sys, os
def test(_name, _file):
    """Report whether ``_file`` exists according to Qt and to os.path.

    Four checks are run in order: QFile.exists, QFileInfo.exists,
    os.path.exists and os.path.isfile.  Each result is printed prefixed
    with ``_name``; a blank line is printed after the last one.

    _name -- label identifying the caller in the output.
    _file -- the file name or path to check.
    """
    def report(label, exists, shown):
        # Printing the name can raise UnicodeEncodeError (for example when
        # it contains a lone surrogate); fall back to the error text and
        # the repr() of the name that was passed in.
        try:
            print(_name, label, shown(), exists)
        except UnicodeEncodeError as err:
            print(err, repr(_file), exists)

    qfile = QtCore.QFile(_file)
    report("QFile.exists", qfile.exists(), qfile.fileName)

    info = QtCore.QFileInfo(_file)
    report("QFileInfo.exists", info.exists(), info.fileName)

    report("os.path.exists", os.path.exists(_file), lambda: _file)
    report("os.path.isfile", os.path.isfile(_file), lambda: _file)
    print()
def test1():
    """Check the file named by argument 1 of QApplication.arguments()."""
    qt_args = QtGui.QApplication.arguments()
    print("test1...using QtGui.QApplication.arguments()")
    first_arg = qt_args[1]
    test("test1", first_arg)
def test2():
    """Check the file named by the first command-line argument in sys.argv."""
    print("test2...using sys.argv")
    first_arg = sys.argv[1]
    test("test2", first_arg)
def test3():
    """Check a file picked interactively through Qt's open-file dialog."""
    print("test3...QtGui.QFileDialog.getOpenFileName()")
    test("test3", QtGui.QFileDialog.getOpenFileName())
def test4():
    """List this script's directory with QDir and check the 123c entry.

    Every entry name is printed.  The entry whose name compares equal to
    "123c" + U+FFFD + ".wav" is additionally passed to test(), once by its
    bare name and once by its absolute path.
    """
    print("test4...QtCore.QDir().entryInfoList()")
    here = os.path.dirname(os.path.abspath(__file__))
    listing = QtCore.QDir(here)
    filters = (QtCore.QDir.AllEntries
               | QtCore.QDir.NoDotAndDotDot
               | QtCore.QDir.System)
    for entry in listing.entryInfoList(filters):
        print("test4", entry.fileName())
        # Comparing against u"123c\udcb4.wav" fails here, even though that
        # is the name reported in the error messages for test2.
        if u"123c\ufffd.wav" == entry.fileName():
            test("test4a", entry.fileName())
            test("test4b", entry.absoluteFilePath())
def test5():
    """List this script's directory with os.listdir and check the 123c entry.

    Each entry is printed by name (falling back to the error text when the
    name cannot be encoded for output) and then as the repr() of its full
    path.  The entry equal to "123c" + U+DCB4 + ".wav" is additionally
    passed to test(), once by bare name and once by full path.
    """
    print("test5...os.listdir()")
    here = os.path.dirname(os.path.abspath(__file__))
    for entry in os.listdir(here):
        full_path = os.path.join(here, entry)
        try:
            print("test5", entry)
        except UnicodeEncodeError as err:
            print(err)
        print("test5", repr(full_path))
        # Comparing against u"123c\ufffd.wav" fails here, even though that
        # comparison is the one that worked in test4.
        if u"123c\udcb4.wav" == entry:
            test("test5a", entry)
            test("test5b", full_path)
    print()
def test6():
    """Play a file chosen through the open-file dialog with Phonon.

    A modal dialog is created whose constructor wires a MediaObject to an
    AudioOutput, asks for a file and starts playback.  Playing, stopped and
    error state changes are printed together with the current source name.
    """
    print("test6...Phonon and QtGui.QFileDialog.getOpenFileName()")
    from PyQt4.phonon import Phonon

    class Window(QtGui.QDialog):
        def __init__(self):
            super(Window, self).__init__(None)
            self.mediaObject = Phonon.MediaObject(self)
            self.audioOutput = Phonon.AudioOutput(Phonon.MusicCategory, self)
            Phonon.createPath(self.mediaObject, self.audioOutput)
            self.mediaObject.stateChanged.connect(self.handleStateChanged)
            # Author's notes on where the file name can come from:
            #   QFileDialog.getOpenFileName()  works with python3, not for 123c
            #   QApplication.arguments()[1]    works with python2 but not
            #                                  python3, not for 123c
            #   sys.argv[1]                    works with python3 but not
            #                                  python2, not for 123c
            chosen = QtGui.QFileDialog.getOpenFileName()
            self.mediaObject.setCurrentSource(Phonon.MediaSource(chosen))
            self.mediaObject.play()

        def handleStateChanged(self, newstate, oldstate):
            # Message prefix for each state that is reported; any other
            # state is ignored.
            reported = (
                (Phonon.PlayingState, 'test6 playing: :'),
                (Phonon.StoppedState, 'test6 stopped: :'),
                (Phonon.ErrorState, 'test6 ERROR: could not play:'),
            )
            for state, prefix in reported:
                if newstate == state:
                    print(prefix, self.mediaObject.currentSource().fileName())
                    break

    dialog = Window()
    dialog.resize(200, 100)
    # exec_() is used instead of show(), so the dialog runs modally.
    dialog.exec_()
def timerTick():
    """Timer callback: ask the Qt application to leave its event loop."""
    QtGui.QApplication.exit(0)
if __name__ == '__main__':
    # Create the application object, then run every check in order.
    app = QtGui.QApplication(sys.argv)
    app.setApplicationName('unicode_test')
    for check in (test1, test2, test3, test4, test5, test6):
        check()

    # A timer with a 1 ms interval fires timerTick(), which exits the
    # application, so the event loop started below ends right away.
    timer = QtCore.QTimer()
    timer.timeout.connect(timerTick)
    timer.start(1)
    sys.exit(app.exec_())
Test results with 123a...
python3 unicode.py 123a�.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists unknown False
test1 QFileInfo.exists unknown False
test1 os.path.exists unknown False
test1 os.path.isfile unknown False
test2...using sys.argv
test2 QFile.exists 123a�.wav True
test2 QFileInfo.exists 123a�.wav True
test2 os.path.exists 123a�.wav True
test2 os.path.isfile 123a�.wav True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123a�.wav True
test3 QFileInfo.exists 123a�.wav True
test3 os.path.exists /home/mememe/Desktop/test/unicode/123a�.wav True
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123a�.wav True
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123a�.wav
test6 playing: : /home/mememe/Desktop/test/unicode/123a�.wav
test6 stopped: : /home/mememe/Desktop/test/unicode/123a�.wav
Test results with 123b...
python3 unicode.py 123bÆ.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists 123b.wav False
test1 QFileInfo.exists 123b.wav False
test1 os.path.exists 123b.wav False
test1 os.path.isfile 123b.wav False
test2...using sys.argv
test2 QFile.exists 123bÆ.wav True
test2 QFileInfo.exists 123bÆ.wav True
test2 os.path.exists 123bÆ.wav True
test2 os.path.isfile 123bÆ.wav True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123bÆ.wav True
test3 QFileInfo.exists 123bÆ.wav True
test3 os.path.exists /home/mememe/Desktop/test/unicode/123bÆ.wav True
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123bÆ.wav True
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123bÆ.wav
test6 playing: : /home/mememe/Desktop/test/unicode/123bÆ.wav
test6 stopped: : /home/mememe/Desktop/test/unicode/123bÆ.wav
Test results with 123c...
python3 unicode.py 123c�.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists unknown False
test1 QFileInfo.exists unknown False
test1 os.path.exists unknown False
test1 os.path.isfile unknown False
test2...using sys.argv
test2 QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test2 QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test2 os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test2 os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test3 QFileInfo.exists 123c�.wav False
test3 os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123c�.wav
Interesting things to note about the test results...
- Test1 failed for all 3 files.
- Test2 passed for all 3 files...except for the QFile and QFileInfo tests for 123c
- Test3 passed for 123a and 123b but failed for 123c
- Test4 ...QDir found all 4 files in the directory
- Test4a and Test4b failed for all files
- Test5 ...os.listdir found all 4 files in the directory
- NOTE: The Test5a and test5b checks had to use a different unicode check?!
- Test5a and Test5b failed the QFile and QFileInfo tests, but passed the os.path checks.
- Test6 passed for 123a and 123b, but failed for 123c: the Phonon player reported only a "stopped" message, whereas the 123a and 123b files got the full "stopped, playing, stopped" sequence.
I know that is a lot of information... I was trying to be thorough.
So, if there is one final question, it is this: what is the right way to deal with Unicode file names in Python 3?
name = u"123c\udcb4.wav"
test("test0b", name)
f = open(name, 'w')
f.write('blah\n') f.close() ` ...which will create an invalid utf-8 named file. Should it? – Bickerstaff