Commit 6e165826 authored by Thomas Huetter
Browse files

fixed names in sentiment and bolzano streets

parent db8be93b
Loading
Loading
Loading
Loading
+3 −3
Original line number Original line Diff line number Diff line
@@ -23,11 +23,11 @@ cd original_data


# prepare data for file L.trees
# prepare data for file L.trees
#     convert file into UTF-8 format   | remove header |  remove IDs   | sort by number of nodes (equivalent to number of "{")
#     convert file into UTF-8 format   | remove header |  remove IDs   | sort by number of nodes (equivalent to number of "{")
# Step details:
#   iconv -f ISO-8859-1 -t "UTF-8"  re-encodes L.trees from ISO-8859-1 to UTF-8
#   tail -n +14                     starts output at line 14, i.e. drops the first 13 lines
#   sed 's/.*://'                   deletes everything up to and including the last ':' on each line
#   awk gsub("{","{")               replaces every "{" with itself, so the return value is the
#                                   number of "{" on the line; that count is printed before the line
#   sort -n                         orders the lines numerically by the leading count
#   cut -d' ' -f2-                  strips the leading count again
# NOTE(review): the two commands below differ only in the output file name
# (.bracket vs .txt); presumably they are the two sides of a diff - confirm.
iconv -f ISO-8859-1 -t "UTF-8" L.trees | tail -n +14   | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../L_preprocessed.bracket
iconv -f ISO-8859-1 -t "UTF-8" L.trees | tail -n +14   | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../L_preprocessed.txt


# prepare data for file R.trees
# prepare data for file R.trees
#     convert file into UTF-8 format   | remove header |  remove IDs   | sort by number of nodes (equivalent to number of "{")
#     convert file into UTF-8 format   | remove header |  remove IDs   | sort by number of nodes (equivalent to number of "{")
# Step details:
#   iconv -f ISO-8859-1 -t "UTF-8"  re-encodes R.trees from ISO-8859-1 to UTF-8
#   tail -n +14                     starts output at line 14, i.e. drops the first 13 lines
#   sed 's/.*://'                   deletes everything up to and including the last ':' on each line
#   awk gsub("{","{")               replaces every "{" with itself, so the return value is the
#                                   number of "{" on the line; that count is printed before the line
#   sort -n                         orders the lines numerically by the leading count
#   cut -d' ' -f2-                  strips the leading count again
# NOTE(review): the two commands below differ only in the output file name
# (.bracket vs .txt); presumably they are the two sides of a diff - confirm.
iconv -f ISO-8859-1 -t "UTF-8" R.trees | tail -n +14   | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../R_preprocessed.bracket
iconv -f ISO-8859-1 -t "UTF-8" R.trees | tail -n +14   | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../R_preprocessed.txt


# prepare dataset with a single label
# prepare dataset with a single label
#                         | remove non-bracket chars. | add single dummy label 'o' > save to file
#                         | remove non-bracket chars. | add single dummy label 'o' > save to file
+2 −2
Original line number Original line Diff line number Diff line
@@ -23,7 +23,7 @@ cd trees


# prepare data for files dev.txt and train.txt
# prepare data for files dev.txt and train.txt
# convert dev.txt and train.txt into UTF-8 format |  replace ( by {  |   replace ) by } |   remove whitespace before '{'   | sort by number of nodes (equivalent to number of "{")
# convert dev.txt and train.txt into UTF-8 format |  replace ( by {  |   replace ) by } |   remove whitespace before '{'   | sort by number of nodes (equivalent to number of "{")
# Step details:
#   iconv -f ISO-8859-1 -t "UTF-8"      re-encodes both input files and writes them out back to back
#   sed -e 's/(/{/g', sed -e 's/)/}/g'  turn parentheses into curly brackets
#   sed -E 's/[[:space:]]([{])/\1/g'    drops the single whitespace character directly before each '{'
#   awk gsub("{","{")                   replaces every "{" with itself, so the return value is the
#                                       number of "{" on the line; that count is printed before the line
#   sort -n                             orders the lines numerically by the leading count
#   cut -d' ' -f2-                      strips the leading count again
# NOTE(review): the two commands below differ only in the output file name
# (sentiment.bracket vs sentiment_sorted.bracket); presumably they are the two
# sides of a diff - confirm.
iconv -f ISO-8859-1 -t "UTF-8" dev.txt train.txt  | sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../sentiment.bracket
iconv -f ISO-8859-1 -t "UTF-8" dev.txt train.txt  | sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | awk '{print gsub("{","{"), $0}' | sort -n  | cut -d' ' -f2- > ../sentiment_sorted.bracket


# go back to the parent directory
# go back to the parent directory
cd ..
cd ..